In [1]:
//default_exp rnn

# rnn

> Implement some of the language models in https://github.com/fastai/fastbook/blob/master/12_nlp_dive.ipynb.

## Set-up data used in tests / demos
    
Using [human_numbers.tgz](https://s3.amazonaws.com/fast-ai-sample/human_numbers.tgz), I created an even smaller dataset with the following;

```
lines = []
with open('data/human_numbers/train.txt') as f: lines.extend(f.readlines())
with open('data/human_numbers/valid.txt') as f: lines.extend(f.readlines())
with open('data/human_numbers/train_and_valid.txt','w') as f:
    f.write(' . '.join([l.strip() for l in lines[:2000]]))
```

In [2]:
// Load the combined train + valid text as a UTF-8 string and preview its first 100 characters.
const text=require('fs').readFileSync('data/human_numbers/train_and_valid.txt','utf8');
text.slice(0,100);

one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo


In [3]:
// Split the text on single spaces into tokens; the vocabulary is the distinct tokens in first-seen order.
const tokens=text.split(' ');
const vocab=Array.from(new Set(tokens));
vocab

[
  'one',      '.',        'two',
  'three',    'four',     'five',
  'six',      'seven',    'eight',
  'nine',     'ten',      'eleven',
  'twelve',   'thirteen', 'fourteen',
  'fifteen',  'sixteen',  'seventeen',
  'eighteen', 'nineteen', 'twenty',
  'thirty',   'forty',    'fifty',
  'sixty',    'seventy',  'eighty',
  'ninety',   'hundred',  'thousand'
]


In [4]:
// Look-up table from each vocabulary word to its position in `vocab`.
const word2idx=Object.fromEntries(vocab.map((word,idx)=>[word,idx]));
word2idx

{
  one: 0,
  '.': 1,
  two: 2,
  three: 3,
  four: 4,
  five: 5,
  six: 6,
  seven: 7,
  eight: 8,
  nine: 9,
  ten: 10,
  eleven: 11,
  twelve: 12,
  thirteen: 13,
  fourteen: 14,
  fifteen: 15,
  sixteen: 16,
  seventeen: 17,
  eighteen: 18,
  nineteen: 19,
  twenty: 20,
  thirty: 21,
  forty: 22,
  fifty: 23,
  sixty: 24,
  seventy: 25,
  eighty: 26,
  ninety: 27,
  hundred: 28,
  thousand: 29
}


In [5]:
// Encode the token sequence as a sequence of vocabulary indices.
const nums=tokens.map(token=>word2idx[token]);
nums

[
   0,  1,  2,  1,  3, 1,  4,  1,  5,  1,  6, 1,
   7,  1,  8,  1,  9, 1, 10,  1, 11,  1, 12, 1,
  13,  1, 14,  1, 15, 1, 16,  1, 17,  1, 18, 1,
  19,  1, 20,  1, 20, 0,  1, 20,  2,  1, 20, 3,
   1, 20,  4,  1, 20, 5,  1, 20,  6,  1, 20, 7,
   1, 20,  8,  1, 20, 9,  1, 21,  1, 21,  0, 1,
  21,  2,  1, 21,  3, 1, 21,  4,  1, 21,  5, 1,
  21,  6,  1, 21,  7, 1, 21,  8,  1, 21,  9, 1,
  22,  1, 22,  0,
  ... 10921 more items
]


In [6]:
// Count how often each token occurs, then report the most frequent token
// together with its share of all tokens (the "always predict the most common token" baseline).
const tokenCounter={};
for (const word of vocab) {
    tokenCounter[word]=0;
}
for (const word of tokens) {
    tokenCounter[word]+=1;
}
let mostCommonToken=[null,0];
for (const [word,count] of Object.entries(tokenCounter)) {
    // strict ">" keeps the first token seen when counts tie
    if (count>mostCommonToken[1]) {
        mostCommonToken=[word,count];
    }
}
console.log('mostCommonToken', mostCommonToken, mostCommonToken[1]/tokens.length);

mostCommonToken [ '.', 1999 ] 0.18138099990926412


&uarr; if we can get better than 0.18 accuracy, our model will be doing better than predicting the most common token

In [7]:
/**
Imports we need in rnn.module.js
*/
import {round,flatten,exp,shape,transpose,dotProduct,randn,uniform,full,zeros} from './src/util.module.js';
import {mean,reshape,argmax,normalize,identity,meanAndStandardDeviation} from './src/util.module.js';
import {matrixSum1d,matrixSum2d,matrixSubtract1d,matrixSubtract2d,matrixMultiply1d,matrixMultiply2d} from './src/util.module.js';
import {head,tail,shuffle,split,batches} from './src/data.module.js';
import {accuracy} from './src/nn.module.js';
import {CrossEntropyLoss} from './src/nn.module.js';

In [8]:
// Imports we need for testing
import {testEq} from './src/testutil.module.js'

## Convert a sequence of tokens into a data structure that our language models can use

In [9]:
/**
Convert a 1d array of numbers (sequence of word indices) to a 2d array with `sequenceLength+1` rows.

Column `c` holds the `sequenceLength+1` consecutive values starting at `nums[c*sequenceLength]`, so row `j` is
timestep `j` of every chunk and the last row of a column repeats as the first row of the next column.
Only complete columns are emitted: there are `floor((nums.length-1)/sequenceLength)` of them (none for empty input).
This makes it easy to iterate over the 1st dimension of "data" to access a chunk of "nums", one timestep at a time.

@param {Array} nums - 1d sequence to chunk.
@param {number} sequenceLength - positive integer; number of values a chunk advances by.
@returns {Array[]} array of `sequenceLength+1` rows.
@throws {RangeError} when `sequenceLength` is not a positive integer (a step of 0 would loop forever).
*/
function toData(nums,sequenceLength) {
    if (!Number.isInteger(sequenceLength) || sequenceLength<1) {
        throw new RangeError(`toData: sequenceLength must be a positive integer, got ${sequenceLength}`);
    }
    const data=full(sequenceLength+1).map(e=>[]);
    // stop early enough that every column still has its final (sequenceLength-th) value
    const iMax=nums.length-sequenceLength;
    for (let i=0; i<iMax; i+=sequenceLength) {
        for (let j=0; j<=sequenceLength; j++) {
            data[j].push(nums[i+j]);
        }
    }
    return data;
}

In [10]:
// Check that toData yields sequenceLength+1 rows and floor(nums.length/sequenceLength) columns
// for several sequence lengths.
[2,3,4,16].forEach(sequenceLength => {
    testEq([sequenceLength+1,Math.floor(nums.length/sequenceLength)],shape(toData(nums,sequenceLength)));
});

## Implement "multi-call" ReLU, Linear and Embedding layers

In the `nn` module, we implement these layers but they can only be used by models that call `forward` and `backward` once for each call to `update`.

Our language models will make multiple calls to `forward`, followed by the same number of calls to `backward` for each call to `update`.

In [11]:
/**
ReLU activation that supports several `forward` calls followed by the same number of `backward` calls
for each call to `update`.

Every `forward` pushes a 0/1 mask (1 where the input was > 0) onto a stack; every `backward` pops the most
recent mask and multiplies it with the incoming gradient via `matrixMultiply2d`.
*/
class ReLU {
    constructor() {
        // one mask per forward call that has not yet been matched by a backward call
        this.gradMasks=[];
    }
    /**
    @param {number[][]} x2d - 2d input.
    @returns {number[][]} new 2d array holding `Math.max(0,x)` for every element.
    */
    forward(x2d) {
        const gradMask=zeros(...shape(x2d));
        this.gradMasks.push(gradMask);
        return x2d.map((x1d,rowIndex) => x1d.map((x,colIndex) => {
            if (x>0) {
                gradMask[rowIndex][colIndex]=1;
            }
            return Math.max(0,x);
        }));
    }
    /**
    @param {number[][]} gradient - gradient with respect to the output of the matching `forward` call.
    @returns result of `matrixMultiply2d(mask, gradient)` for the most recently stored mask.
    @throws {Error} when there is no stored mask left, i.e. more `backward` than `forward` calls.
    */
    backward(gradient) {
        if (this.gradMasks.length === 0) {
            throw new Error(`ReLU: backward has been called too many times`);
        }
        return matrixMultiply2d(this.gradMasks.pop(),gradient);
    }
    /**
    ReLU has no parameters; `update` only verifies that every `forward` was matched by a `backward`.
    @param {number} lr - unused, accepted so ReLU can be updated like the other layers.
    @throws {Error} when masks are still pending.
    */
    update(lr) {
        if (this.gradMasks.length !== 0) {
            throw new Error(`ReLU: forward has been called ${this.gradMasks.length} times more than backward`);
        }
    }
}

In [12]:
/**
Accumulation helper: returns `a` untouched when `b` is `null`/`undefined` (nothing accumulated yet),
otherwise returns `matrixSum2d(a,b)`.
*/
function _matrixSum2d(a,b) {
    if (b === null || b === undefined) {
        return a;
    }
    return matrixSum2d(a,b);
}
/**
Accumulation helper: returns `a` untouched when `b` is `null`/`undefined` (nothing accumulated yet),
otherwise returns `matrixSum1d(a,b)`.
*/
function _matrixSum1d(a,b) {
    if (b === null || b === undefined) {
        return a;
    }
    return matrixSum1d(a,b);
}

Note: For both `Linear` and `Embedding`, `backward` doesn't scale gradients relative to batch size, and `update` doesn't adjust `lr` to account for batch size either. So ... if we change batch size, we'll probably need to adjust learning rates too.

In [13]:
/**
Applies a linear transformation to `x`.

Supports several `forward` calls followed by the same number of `backward` calls for each call to `update`:
inputs are kept on a stack (`xHistory`) and gradients are summed across `backward` calls until `update`
applies and clears them.
*/
class Linear {
    /**
    @param {number} inputDim - number of input features (rows of `weights`).
    @param {number} numHidden - number of output features (columns of `weights`).
    @param {boolean} bias - when false the bias is left at its initial zeros by `update`.
    */
    constructor(inputDim,numHidden=1,bias=true) {
        this.inputDim=inputDim;
        this.numHidden=numHidden;
        // random weights scaled by sqrt(2/inputDim)
        this.weights=matrixMultiply2d(randn(inputDim,numHidden), Math.sqrt(2.0/inputDim));
        this.bias=zeros(numHidden);
        this.updateBias=bias;
        // inputs of forward calls that have not yet been matched by a backward call
        this.xHistory=[];
        // gradients summed over backward calls since the last update (null = nothing accumulated)
        this.weightsGradient=null;
        this.biasGradient=null;
        this.label=`${this.constructor.name}(${this.inputDim},${this.numHidden})`;
    }
    /**
    Remembers `x` for the matching `backward` call and returns `x · weights + bias`.
    */
    forward(x) {
        this.xHistory.push(x);
        return matrixSum2d(dotProduct(x,this.weights), this.bias);
    }
    /**
    Adds this call's weight and bias gradients to the running totals.
    @param gradient - gradient with respect to the output of the matching `forward` call.
    @returns gradient with respect to that call's input (`gradient · weightsᵀ`).
    @throws {Error} when called more often than `forward`.
    */
    backward(gradient) {
        if (this.xHistory.length === 0) {
            throw new Error(`${this.label}: backward has been called too many times`);
        }
        const weightsGradient=dotProduct(transpose(this.xHistory.pop()), gradient);
        this.weightsGradient=_matrixSum2d(weightsGradient,this.weightsGradient);
        // bias gradient = column sums of the incoming gradient
        const biasGradient=transpose(gradient).map(col => col.reduce((a,b) => a+b));
        this.biasGradient=_matrixSum1d(biasGradient,this.biasGradient);
        return dotProduct(gradient,transpose(this.weights)); // xGradient
    }
    /**
    Applies the accumulated gradients scaled by `lr`, then clears them.
    Does nothing when no gradient has been accumulated since the last update.
    @throws {Error} when some `forward` calls have not been matched by `backward`.
    */
    update(lr) {
        if (this.xHistory.length !== 0) {
            throw new Error(`${this.label}: forward has been called ${this.xHistory.length} times more than backward`);
        }
        if (this.weightsGradient == null) {
            return; // no backward call since the last update, so there is nothing to apply
        }
        this.weights=matrixSubtract2d(this.weights,matrixMultiply2d(this.weightsGradient,lr));
        if (this.updateBias) {
            this.bias=matrixSubtract1d(this.bias,matrixMultiply1d(this.biasGradient,lr));
        }
        this.weightsGradient=null;
        this.biasGradient=null;
    }
}

In [14]:
/**
Using
- `Embedding` when `x` is an array of IDs or
- `Linear` when `x` is a one-hot encoded matrix
should give the same results - but `Embedding` should be faster.

`forward` looks up one row of `weights` per ID instead of multiplying by a one-hot matrix, and `backward`
adds each gradient row to the weight-gradient row of the ID that produced it.
*/
class Embedding extends Linear {
    constructor(inputDim,numHidden=1,bias=true) {
        super(inputDim,numHidden,bias);
        // replace the weights created by Linear with values from uniform(...,-1,1)
        this.weights=uniform(inputDim,numHidden,-1,1);
    }
    /**
    @param {number[]} x - array of IDs, each used as a row index into `weights`.
    @returns the selected weight rows plus the bias.
    */
    forward(x) {
        this.xHistory.push(x);
        return matrixSum2d(x.map(i=>this.weights[i]), this.bias);
    }
    /**
    Adds this call's weight and bias gradients to the running totals.
    @param gradient - gradient with respect to the output of the matching `forward` call (one row per ID).
    @returns `gradient · weightsᵀ`, computed the same way as in `Linear`.
    @throws {Error} when called more often than `forward`.
    */
    backward(gradient) {
        if (this.xHistory.length === 0) {
            throw new Error(`${this.label}: backward has been called too many times`);
        }
        const weightsGradient=zeros(this.inputDim,this.numHidden);
        const x=this.xHistory.pop();
        // single pass over the batch: each gradient row goes to the row of the ID it came from
        x.forEach((id,rowIndex) => {
            // IDs without a weight row are ignored
            if (weightsGradient[id] !== undefined) {
                weightsGradient[id]=matrixSum1d(weightsGradient[id],gradient[rowIndex]);
            }
        });
        this.weightsGradient=_matrixSum2d(weightsGradient,this.weightsGradient);
        // bias gradient = column sums of the incoming gradient
        const biasGradient=transpose(gradient).map(col => col.reduce((a,b) => a+b));
        this.biasGradient=_matrixSum1d(biasGradient,this.biasGradient);
        return dotProduct(gradient,transpose(this.weights));
    }
}

## First recurrent model - predict next word from previous 3 words

In [15]:
// Recurrent language model: feeds `sequenceLength` timesteps through shared input->hidden and
// hidden->hidden layers, then predicts from the final hidden state.
class LMModel2 {
    constructor(vocab_sz, n_hidden, sequenceLength) {
        this.sequenceLength = sequenceLength || 3; // falls back to 3 when not supplied
        this.i_h = new Linear(vocab_sz, n_hidden);
        this.h_h = new Linear(n_hidden, n_hidden);
        this.h_o = new Linear(n_hidden, vocab_sz);
        this.non_linear = new ReLU();
        // row `i` of the lookup is the (normalized) one-hot vector for word index `i`
        this.oneHotLookup = normalize(identity(vocab_sz));
    }

    // Replace every word index in `x` by its row of the one-hot lookup.
    toOneHot(x) {
        return x.map(wordIndex => this.oneHotLookup[wordIndex]);
    }

    // `x[t]` holds the word indices for timestep `t`; returns the output layer's result for the last hidden state.
    forward(x) {
        let hidden = 0;
        let step = 0;
        while (step < this.sequenceLength) {
            const embedded = this.i_h.forward(this.toOneHot(x[step]));
            const combined = matrixSum2d(embedded, hidden);
            hidden = this.non_linear.forward(this.h_h.forward(combined));
            step += 1;
        }
        return this.h_o.forward(hidden);
    }

    // Walk the timesteps in reverse, calling each layer's backward once per forward call.
    backward(gradients) {
        let signal = this.h_o.backward(gradients);
        let remaining = this.sequenceLength;
        while (remaining > 0) {
            signal = this.h_h.backward(this.non_linear.backward(signal));
            this.i_h.backward(signal);
            remaining -= 1;
        }
    }

    // Apply accumulated gradients in every layer.
    update(lr) {
        for (const layer of [this.i_h, this.h_h, this.h_o, this.non_linear]) {
            layer.update(lr);
        }
    }
}

`constructor`
- normalizing the one-hot lookup seems to make training more stable and slightly more accurate

`forward`
- If `i_h` was an embedding we wouldn't need `toOneHot`
- When we `matrixSum2d(i_h.forward(), h)` we put `h` on the right so that the initial value `0` is re-shaped to match the shape of the output of `i_h` (which is `[bs,n_hidden]`, where `bs` is the batch size)



In [16]:
// train a model that predicts the next word after any number of input words
//
// model_fn      - model class; constructed as new model_fn(vocab.length, 28, sequenceLength)
// data          - rows are timesteps; the last row (index sequenceLength) is the target
// dropLastBatch - passed through to batches()
// shuffleBatch  - passed through to batches()
//
// Runs 10 epochs with batch size 64 and learning rate 3e-3, logging the mean loss and accuracy per epoch.
function train(model_fn,data,dropLastBatch=false,shuffleBatch=true) {
    console.log('Training model',model_fn);
    const lossFn=new CrossEntropyLoss();
    const model=new model_fn(vocab.length,28,data.length-1); // data.length-1 is sequence length
    for (let epoch=0; epoch<10; epoch++) {
        // per-epoch statistics
        const lossValues=[];
        const accuracyValues=[];
        for (const batch of batches(data,64,dropLastBatch,shuffleBatch)) {
            // the whole batch is passed as x; the model will look at only sequence length tokens
            const yb=batch[model.sequenceLength];
            const preds=model.forward(batch);
            lossValues.push(lossFn.forward(preds,yb));
            accuracyValues.push(accuracy(preds,yb));
            model.backward(lossFn.backward());
            model.update(3e-3);
        }
        console.log('epoch',epoch,new Date(),'train loss',round(mean(lossValues),3),
                    'accuracy',round(mean(accuracyValues),3));
        // models that carry state across batches expose reset(); clear it between epochs
        if (model.reset) {
            model.reset();
        }
    }
}

Train a model to predict the next word after 3 input words

In [17]:
// Train LMModel2 with sequence length 3: each data column holds 3 input tokens followed by the target token.
train(LMModel2,toData(nums,3));

Training model [class LMModel2]
epoch 0 2021-05-18T10:28:09.423Z train loss 2.011 accuracy 0.493
epoch 1 2021-05-18T10:28:10.368Z train loss 1.45 accuracy 0.562
epoch 2 2021-05-18T10:28:11.291Z train loss 1.391 accuracy 0.561
epoch 3 2021-05-18T10:28:12.258Z train loss 1.358 accuracy 0.562
epoch 4 2021-05-18T10:28:13.250Z train loss 1.344 accuracy 0.569
epoch 5 2021-05-18T10:28:14.345Z train loss 1.351 accuracy 0.561
epoch 6 2021-05-18T10:28:15.401Z train loss 1.338 accuracy 0.565
epoch 7 2021-05-18T10:28:16.460Z train loss 1.326 accuracy 0.558
epoch 8 2021-05-18T10:28:17.608Z train loss 1.322 accuracy 0.559
epoch 9 2021-05-18T10:28:18.777Z train loss 1.313 accuracy 0.561


Train a model to predict the next word after 4 input words &darr; to show this model and training approach are not just doing well because of the sequence length

In [18]:
// Train LMModel2 with sequence length 4: each data column holds 4 input tokens followed by the target token.
train(LMModel2,toData(nums,4));

Training model [class LMModel2]
epoch 0 2021-05-18T10:28:19.862Z train loss 2.034 accuracy 0.486
epoch 1 2021-05-18T10:28:20.929Z train loss 1.52 accuracy 0.537
epoch 2 2021-05-18T10:28:22.054Z train loss 1.415 accuracy 0.554
epoch 3 2021-05-18T10:28:23.086Z train loss 1.394 accuracy 0.545
epoch 4 2021-05-18T10:28:24.148Z train loss 1.375 accuracy 0.555
epoch 5 2021-05-18T10:28:25.210Z train loss 1.36 accuracy 0.556
epoch 6 2021-05-18T10:28:26.224Z train loss 1.381 accuracy 0.558
epoch 7 2021-05-18T10:28:27.252Z train loss 1.382 accuracy 0.549
epoch 8 2021-05-18T10:28:28.268Z train loss 1.323 accuracy 0.557
epoch 9 2021-05-18T10:28:29.297Z train loss 1.309 accuracy 0.567


## LMModel2 with an `Embedding`

Using an embedding for the input to hidden layer makes training faster but ... if we one-hot encoded x, rather than call `toOneHot` in `LMModel2#forward`, we might see less of a difference.

In [19]:
// LMModel2 whose input->hidden layer is an Embedding, so word indices are fed in directly
// instead of being one-hot encoded first.
class LMModel2_2 extends LMModel2 {
    constructor(vocab_sz, n_hidden, sequenceLength) {
        super(vocab_sz, n_hidden, sequenceLength);
        this.i_h=new Embedding(vocab_sz, n_hidden);
        this.i_h.weights=uniform(vocab_sz, n_hidden,-2,2);
    }
    
    // `x[t]` holds the word indices for timestep `t`; returns the output layer's result for the last hidden state.
    forward(x) {
        let h=0;
        // Iterate this.sequenceLength times (not a fixed 3) so the number of layer forward calls always
        // matches the number of backward calls made by the inherited backward().
        for (let i=0; i<this.sequenceLength; i++) {
            /* Use this chunk (and comment the line below) to see stats of i_h output
            const _h=this.i_h.forward(x[i]);
            console.log(meanAndStandardDeviation(flatten(_h)));
            h = matrixSum2d(_h, h);
            */
            h = matrixSum2d(this.i_h.forward(x[i]), h);
            h = this.non_linear.forward(this.h_h.forward(h));
        }
        return this.h_o.forward(h);
    }
    
}

In [20]:
// Train the Embedding-based variant on the same sequence-length-3 data used for LMModel2.
train(LMModel2_2,toData(nums,3));

Training model [class LMModel2_2 extends LMModel2]
epoch 0 2021-05-18T10:28:30.173Z train loss 1.927 accuracy 0.493
epoch 1 2021-05-18T10:28:30.979Z train loss 1.49 accuracy 0.553
epoch 2 2021-05-18T10:28:31.931Z train loss 1.423 accuracy 0.552
epoch 3 2021-05-18T10:28:32.738Z train loss 1.391 accuracy 0.554
epoch 4 2021-05-18T10:28:33.607Z train loss 1.366 accuracy 0.557
epoch 5 2021-05-18T10:28:34.413Z train loss 1.355 accuracy 0.566
epoch 6 2021-05-18T10:28:35.137Z train loss 1.344 accuracy 0.564
epoch 7 2021-05-18T10:28:35.895Z train loss 1.333 accuracy 0.558
epoch 8 2021-05-18T10:28:36.693Z train loss 1.32 accuracy 0.565
epoch 9 2021-05-18T10:28:37.493Z train loss 1.315 accuracy 0.562


# Maintaining the State of an RNN

Organise data so the model sees contiguous text over subsequent batches

In [21]:
/**
Reorder the columns of `ds` so that consecutive batches of size `bs` continue each other.

With `m = floor(ds[0].length/bs)`, output column `i*bs + j` is input column `i + m*j`; position `j` of
batch `i` is therefore followed, at position `j` of batch `i+1`, by the next input column.
Input columns at index `m*bs` and beyond are dropped.

@param {Array[]} ds - 2d array; every row is indexed with the same column indices.
@param {number} bs - batch size.
@returns {Array[]} new 2d array with the same number of rows and `m*bs` columns.
*/
function groupChunks(ds,bs=64) {
    const batchCount = Math.floor(ds[0].length/bs);
    return Array.from({length: ds.length}, (_unused, rowIndex) => {
        const source = ds[rowIndex];
        const reordered = [];
        for (let batch=0; batch<batchCount; batch++) {
            for (let slot=0; slot<bs; slot++) {
                reordered.push(source[batch + batchCount*slot]);
            }
        }
        return reordered;
    });
}

In [22]:
// Show the first 5 items of the first 3 batches after groupChunks: item i of one batch is
// continued by item i of the next batch.
let _dataTemp = toData(nums,3);
let _data=groupChunks(_dataTemp);

console.log(shape(_data));
let _batches=batches(_data,64,true,false);
for (let b=0; b<3; b++) {
    const currentBatch=_batches[b];
    for (let i=0; i<5; i++) {
        const wordAt = timestep => vocab[currentBatch[timestep][i]];
        console.log(b,i,'x',wordAt(0),wordAt(1),wordAt(2),'y',wordAt(3));
    }
}

[ 4, 3648 ]
0 0 x one . two y .
0 1 x sixty six . y sixty
0 2 x hundred eighteen . y one
0 3 x three . one y hundred
0 4 x eighty eight . y one
1 0 x . three . y four
1 1 x sixty seven . y sixty
1 2 x one hundred nineteen y .
1 3 x hundred fifty four y .
1 4 x one hundred eighty y nine
2 0 x four . five y .
2 1 x sixty eight . y sixty
2 2 x . one hundred y twenty
2 3 x . one hundred y fifty
2 4 x nine . one y hundred


here's how we could move the initialization of `h` out of `forward` &darr;

In [23]:
// LMModel2 that keeps its hidden state `h` on the instance, so the state left by one forward call is the
// starting state of the next; reset() clears it.
class LMModel3 extends LMModel2 {
    constructor(vocab_sz, n_hidden, sequenceLength) {
        super(vocab_sz, n_hidden, sequenceLength);
        this.h=0;
    }
    
    forward(x) {
        // Iterate this.sequenceLength times (not a fixed 3) so the number of layer forward calls always
        // matches the number of backward calls made by the inherited backward().
        for (let i=0; i<this.sequenceLength; i++) {
            this.h = matrixSum2d(this.i_h.forward(this.toOneHot(x[i])), this.h);
            this.h = this.non_linear.forward(this.h_h.forward(this.h));
        }
        return this.h_o.forward(this.h);
    }
        
    // Forget the hidden state carried over from previous forward calls.
    reset() {
        this.h=0;
    }
}

Without organising the data so the model sees contiguous text over subsequent batches, we lose a little accuracy (~2%), but I thought we'd lose more. Maybe if we were looking at validation accuracy we'd see more of a difference.

Organising the data with `groupChunks` improves a little on our first model's accuracy.

In [24]:
// Train on groupChunks-ordered data with dropLastBatch=true and shuffleBatch=false.
// The commented-out call below trains on the un-grouped data instead, for comparison.
// train(LMModel3,toData(nums,3));
train(LMModel3,groupChunks(toData(nums,3)),true,false);

Training model [class LMModel3 extends LMModel2]
epoch 0 2021-05-18T10:28:38.814Z train loss 1.985 accuracy 0.477
epoch 1 2021-05-18T10:28:39.896Z train loss 1.446 accuracy 0.559
epoch 2 2021-05-18T10:28:40.952Z train loss 1.376 accuracy 0.56
epoch 3 2021-05-18T10:28:42.019Z train loss 1.333 accuracy 0.567
epoch 4 2021-05-18T10:28:43.014Z train loss 1.291 accuracy 0.57
epoch 5 2021-05-18T10:28:44.038Z train loss 1.245 accuracy 0.576
epoch 6 2021-05-18T10:28:45.090Z train loss 1.198 accuracy 0.585
epoch 7 2021-05-18T10:28:46.122Z train loss 1.157 accuracy 0.589
epoch 8 2021-05-18T10:28:47.123Z train loss 1.117 accuracy 0.597
epoch 9 2021-05-18T10:28:48.163Z train loss 1.091 accuracy 0.6


we should expect similar results when `i_h` is an embedding

In [25]:
// LMModel2_2 (Embedding input layer) that keeps its hidden state `h` on the instance, so the state left by
// one forward call is the starting state of the next; reset() clears it.
class LMModel3_2 extends LMModel2_2 {
    constructor(vocab_sz, n_hidden, sequenceLength) {
        super(vocab_sz, n_hidden, sequenceLength);
        this.h=0;
    }
    
    forward(x) {
        // Iterate this.sequenceLength times (not a fixed 3) so the number of layer forward calls always
        // matches the number of backward calls made by the inherited backward().
        for (let i=0; i<this.sequenceLength; i++) {
            this.h = matrixSum2d(this.i_h.forward(x[i]), this.h);
            this.h = this.non_linear.forward(this.h_h.forward(this.h));
        }
        return this.h_o.forward(this.h);
    }
        
    // Forget the hidden state carried over from previous forward calls.
    reset() {
        this.h=0;
    }
}

In [26]:
// Same set-up as for LMModel3: groupChunks-ordered data, dropLastBatch=true, shuffleBatch=false.
train(LMModel3_2,groupChunks(toData(nums,3)),true,false);

Training model [class LMModel3_2 extends LMModel2_2]
epoch 0 2021-05-18T10:28:49.045Z train loss 2.116 accuracy 0.452
epoch 1 2021-05-18T10:28:49.805Z train loss 1.481 accuracy 0.546
epoch 2 2021-05-18T10:28:50.563Z train loss 1.415 accuracy 0.558
epoch 3 2021-05-18T10:28:51.324Z train loss 1.376 accuracy 0.566
epoch 4 2021-05-18T10:28:52.084Z train loss 1.345 accuracy 0.568
epoch 5 2021-05-18T10:28:52.854Z train loss 1.314 accuracy 0.572
epoch 6 2021-05-18T10:28:53.608Z train loss 1.283 accuracy 0.575
epoch 7 2021-05-18T10:28:54.397Z train loss 1.253 accuracy 0.577
epoch 8 2021-05-18T10:28:55.172Z train loss 1.228 accuracy 0.582
epoch 9 2021-05-18T10:28:55.933Z train loss 1.204 accuracy 0.585


# Flattening/un-flattening multi-dimensional arrays

Up until now, our language models have returned a result of size `bs x vocab_sz`.

Our next model will output a prediction per-timestep, so its result will be `sequence_length x bs x vocab_sz`. Note: Most deep learning code would make `bs` the 1st dimension, but deviating from this convention makes this code simpler.

We use `Flatten` below, to reshape data so that we can use `CrossEntropyLoss` and `accuracy` without modification.
`Flatten` can also reverse the reshape, so we can take the signal returned by `CrossEntropyLoss` and feed it into the model's `backward` function.

In [27]:
/**
Merges the first two dimensions of an array (`forward`) and splits them apart again (`backward`).

`forward` records the shape of its argument; `backward` uses the first two entries of the most recently
recorded shape, so one instance can un-flatten any array whose first two dimensions match.
*/
class Flatten {
    // Concatenate the rows of `x` into one array, one level deep.
    forward(x) {
        this.originalShape=shape(x);
        return x.flat();
    }
    // Cut `x` into `originalShape[0]` consecutive slices of `originalShape[1]` items each.
    backward(x) {
        const [rowCount,rowLength]=this.originalShape;
        return Array.from({length: rowCount}, (_unused, row) => x.slice(row*rowLength, (row+1)*rowLength));
    }
}

The following test shows that we can use the same flatten for multiple arrays, as long as the first 2 dimensions of the arrays are the same.

In [28]:
// `a` is a 3x2 array and `b` a 3x2x4 array; every element is a string naming its own indices.
let a=Array.from({length:3},(_row,i)=>Array.from({length:2},(_col,j)=>`${i}.${j}`));
let b=Array.from({length:3},(_row,i)=>Array.from({length:2},(_col,j)=>Array.from({length:4},(_item,k)=>`${i}.${j}.${k}`)));
// snapshots to compare against after the round trip
let _a=JSON.stringify(a),_b=JSON.stringify(b);
let flatten=new Flatten();
let aFlat=flatten.forward(a);
let bFlat=flatten.forward(b);
testEq([6],shape(aFlat));
testEq([6,4],shape(bFlat));
// both arrays are un-flattened with the shape recorded by the last forward call (the one for `b`)
let aUnflat=flatten.backward(aFlat);
let bUnflat=flatten.backward(bFlat);
testEq([3,2],shape(aUnflat));
testEq([3,2,4],shape(bUnflat));
testEq(_a,JSON.stringify(aUnflat));
testEq(_b,JSON.stringify(bUnflat));

# Creating More Signal

In [29]:
// Recurrent language model that returns a prediction for every timestep (not only the last one) and
// keeps its hidden state `h` between forward calls until reset() is called.
class LMModel4 {
    constructor(vocab_sz, n_hidden) {
        this.i_h=new Embedding(vocab_sz, n_hidden);
        this.i_h.weights=uniform(vocab_sz, n_hidden,-2,2);
        this.h_h = new Linear(n_hidden, n_hidden);
        this.h_o = new Linear(n_hidden, vocab_sz);
        this.non_linear = new ReLU();
        this.h=0;
    }

    // `x[t]` holds the word indices for timestep `t`; returns one output-layer result per timestep.
    forward(x) {
        const outputs=[];
        for (const tokenIds of x) {
            const embedded=this.i_h.forward(tokenIds);
            this.h = matrixSum2d(embedded, this.h);
            const activated=this.non_linear.forward(this.h_h.forward(this.h));
            this.h = activated;
            outputs.push(this.h_o.forward(activated));
        }
        return outputs;
    }

    // `gradients[t]` is the gradient for the output of timestep `t`. Timesteps are processed last-to-first;
    // the gradient flowing back from later timesteps is added to each timestep's own output gradient.
    backward(gradients) {
        let carried=null;
        let step=gradients.length;
        while (step-- > 0) {
            const fromOutput=this.h_o.backward(gradients[step]);
            const combined=_matrixSum2d(fromOutput,carried);
            carried=this.h_h.backward(this.non_linear.backward(combined));
            this.i_h.backward(carried);
        }
    }

    // Apply accumulated gradients in every layer.
    update(lr) {
        for (const layer of [this.i_h, this.h_h, this.h_o, this.non_linear]) {
            layer.update(lr);
        }
    }

    // Forget the hidden state carried over from previous forward calls.
    reset() {
        this.h=0;
    }
}

In [37]:
// Train a model that predicts a token at every timestep.
//
// model - already-constructed model exposing forward/backward/update/reset and an `i_h` layer
// data  - rows are timesteps; rows 0..n-2 are the inputs and rows 1..n-1 the per-timestep targets
//
// The data is re-ordered with groupChunks, then trained for 15 epochs with batch size 64 and learning
// rate 3e-4, logging per epoch the mean loss, the accuracy over all timesteps and the accuracy of the
// last timestep only.
function train2(model,data) {
    const bs=64;
    const grouped=groupChunks(data,bs);
    console.log('Training model',model.constructor.name,'n_hidden',model.i_h.numHidden,
                'shape(data)',shape(grouped),'sequenceLength',grouped.length-1);
    const lossFn=new CrossEntropyLoss();
    const flatten=new Flatten();
    for (let epoch=0; epoch<15; epoch++) {
        // it's ok to take the mean of these per-batch values as all batches are the same size
        const lossValues=[];
        const accuracyValues=[];
        const lastAccuracyValues=[];
        for (const batch of batches(grouped,bs,true,false)) {
            const xb=batch.slice(0,-1);
            const ybFlat=flatten.forward(batch.slice(1));
            const preds=model.forward(xb);
            const predsFlat=flatten.forward(preds);
            lossValues.push(lossFn.forward(predsFlat,ybFlat));
            accuracyValues.push(accuracy(predsFlat,ybFlat));
            // track accuracy on just the last token - to compare with previous models
            lastAccuracyValues.push(accuracy(preds[preds.length-1],batch[batch.length-1]));
            // un-flatten the loss gradient back to one entry per timestep before handing it to the model
            model.backward(flatten.backward(lossFn.backward()));
            model.update(3e-4);
        }
        console.log('epoch',epoch,new Date(),'train loss',round(mean(lossValues),3),
                    'accuracy',round(mean(accuracyValues),3),
                    'last token accuracy',round(mean(lastAccuracyValues),3));
        model.reset();
    }
}

In [39]:
// Train LMModel4 with 28 hidden units on sequence-length-3 data, predicting a token at every timestep.
train2(new LMModel4(vocab.length,28), toData(nums,3));

Training model LMModel4 n_hidden 28 shape(data) [ 4, 3648 ] sequenceLength 3
epoch 0 2021-05-18T10:34:00.543Z train loss 2.784 accuracy 0.345 last token accuracy 0.346
epoch 1 2021-05-18T10:34:06.482Z train loss 1.92 accuracy 0.49 last token accuracy 0.49
epoch 2 2021-05-18T10:34:12.139Z train loss 1.709 accuracy 0.519 last token accuracy 0.526
epoch 3 2021-05-18T10:34:17.721Z train loss 1.603 accuracy 0.532 last token accuracy 0.535
epoch 4 2021-05-18T10:34:23.051Z train loss 1.546 accuracy 0.538 last token accuracy 0.54
epoch 5 2021-05-18T10:34:28.379Z train loss 1.509 accuracy 0.541 last token accuracy 0.542
epoch 6 2021-05-18T10:34:33.759Z train loss 1.483 accuracy 0.544 last token accuracy 0.546
epoch 7 2021-05-18T10:34:39.090Z train loss 1.464 accuracy 0.547 last token accuracy 0.548
epoch 8 2021-05-18T10:34:44.552Z train loss 1.45 accuracy 0.548 last token accuracy 0.551
epoch 9 2021-05-18T10:34:49.926Z train loss 1.437 accuracy 0.549 last token accuracy 0.552
epoch 10 2021-05-1

In [41]:
// Train LMModel4 with 64 hidden units on sequence-length-8 data, predicting a token at every timestep.
train2(new LMModel4(vocab.length,64), toData(nums,8));

Training model LMModel4 n_hidden 64 shape(data) [ 9, 1344 ] sequenceLength 8
epoch 0 2021-05-18T10:37:25.061Z train loss 2.883 accuracy 0.402 last token accuracy 0.407
epoch 1 2021-05-18T10:37:42.164Z train loss 1.659 accuracy 0.53 last token accuracy 0.537
epoch 2 2021-05-18T10:37:59.376Z train loss 1.51 accuracy 0.544 last token accuracy 0.548
epoch 3 2021-05-18T10:38:16.514Z train loss 1.46 accuracy 0.547 last token accuracy 0.554
epoch 4 2021-05-18T10:38:33.738Z train loss 1.431 accuracy 0.55 last token accuracy 0.555
epoch 5 2021-05-18T10:38:50.825Z train loss 1.408 accuracy 0.554 last token accuracy 0.561
epoch 6 2021-05-18T10:39:08.044Z train loss 1.386 accuracy 0.556 last token accuracy 0.561
epoch 7 2021-05-18T10:39:25.119Z train loss 1.359 accuracy 0.559 last token accuracy 0.565
epoch 8 2021-05-18T10:39:42.283Z train loss 1.329 accuracy 0.565 last token accuracy 0.57
epoch 9 2021-05-18T10:39:59.578Z train loss 1.298 accuracy 0.573 last token accuracy 0.577
epoch 10 2021-05-1