In [1]:
//default_exp rnn

# rnn

> Implement some of the language models in https://github.com/fastai/fastbook/blob/master/12_nlp_dive.ipynb.

## Set-up data used in tests / demos
    
Using [human_numbers.tgz](https://s3.amazonaws.com/fast-ai-sample/human_numbers.tgz), I created an even smaller dataset with the following;

```
lines = []
with open('data/human_numbers/train.txt') as f: lines.extend(f.readlines())
with open('data/human_numbers/valid.txt') as f: lines.extend(f.readlines())
with open('data/human_numbers/train_and_valid.txt','w') as f:
    f.write(' . '.join([l.strip() for l in lines[:2000]]))
```

In [2]:
const text=require('fs').readFileSync('data/human_numbers/train_and_valid.txt').toString();
text.substring(0,100);

one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo


In [3]:
const tokens=text.split(' ');
const vocab = [...new Set(tokens)];
vocab

[
  'one',      '.',        'two',
  'three',    'four',     'five',
  'six',      'seven',    'eight',
  'nine',     'ten',      'eleven',
  'twelve',   'thirteen', 'fourteen',
  'fifteen',  'sixteen',  'seventeen',
  'eighteen', 'nineteen', 'twenty',
  'thirty',   'forty',    'fifty',
  'sixty',    'seventy',  'eighty',
  'ninety',   'hundred',  'thousand'
]


In [4]:
const word2idx={};
vocab.forEach((word,idx)=>word2idx[word]=idx);
word2idx

{
  one: 0,
  '.': 1,
  two: 2,
  three: 3,
  four: 4,
  five: 5,
  six: 6,
  seven: 7,
  eight: 8,
  nine: 9,
  ten: 10,
  eleven: 11,
  twelve: 12,
  thirteen: 13,
  fourteen: 14,
  fifteen: 15,
  sixteen: 16,
  seventeen: 17,
  eighteen: 18,
  nineteen: 19,
  twenty: 20,
  thirty: 21,
  forty: 22,
  fifty: 23,
  sixty: 24,
  seventy: 25,
  eighty: 26,
  ninety: 27,
  hundred: 28,
  thousand: 29
}


In [5]:
const nums=tokens.map(e=>word2idx[e]);
nums

[
   0,  1,  2,  1,  3, 1,  4,  1,  5,  1,  6, 1,
   7,  1,  8,  1,  9, 1, 10,  1, 11,  1, 12, 1,
  13,  1, 14,  1, 15, 1, 16,  1, 17,  1, 18, 1,
  19,  1, 20,  1, 20, 0,  1, 20,  2,  1, 20, 3,
   1, 20,  4,  1, 20, 5,  1, 20,  6,  1, 20, 7,
   1, 20,  8,  1, 20, 9,  1, 21,  1, 21,  0, 1,
  21,  2,  1, 21,  3, 1, 21,  4,  1, 21,  5, 1,
  21,  6,  1, 21,  7, 1, 21,  8,  1, 21,  9, 1,
  22,  1, 22,  0,
  ... 10921 more items
]


In [6]:
const tokenCounter={};
vocab.forEach(token=>tokenCounter[token]=0);
tokens.forEach(token=>tokenCounter[token]++);
let mostCommonToken=[null,0];
for (let key in tokenCounter) {
    if (tokenCounter[key]>mostCommonToken[1]) {
        mostCommonToken=[key,tokenCounter[key]];
    }
}
console.log('mostCommonToken', mostCommonToken, mostCommonToken[1]/tokens.length);

mostCommonToken [ '.', 1999 ] 0.18138099990926412


&uarr; if we can get better than 0.18 accuracy, our model will be doing better than predicting the most common token

In [7]:
/**
Imports we need in rnn.module.js
*/
import {round,flatten,exp,shape,transpose,dotProduct,randn,uniform,full,zeros} from './src/util.module.js';
import {mean,reshape,argmax,normalize,identity,meanAndStandardDeviation} from './src/util.module.js';
import {matrixSum1d,matrixSum2d,matrixSubtract1d,matrixSubtract2d,matrixMultiply1d,matrixMultiply2d} from './src/util.module.js';
import {head,tail,parseCsv,IRIS_CLASS_MAP,IrisRowHandler,shuffle,split,batches} from './src/data.module.js';
import {accuracy,Sigmoid,MSE,ReLU,Linear,Embedding,Learner} from './src/nn.module.js';
import {BinaryCrossEntropyLoss,CrossEntropyLoss} from './src/nn.module.js';

In [8]:
// Imports we need for testing
import {testEq} from './src/testutil.module.js'

In [9]:
/**
Convert a 1d array of numbers (sequence of word indices) to a 2d array of shape [sequenceLength+1, nums.length/sequenceLength].
This makes it easy to iterate over the 1st dimention of "data" to access a chunk of "nums", one timestep at a time.
*/
function toData(nums,sequenceLength) {
    const data=full(sequenceLength+1).map(e=>[]);
    const iMax=nums.length-sequenceLength;
    for (let i=0; i<iMax; i+=sequenceLength) {
        for (let j=0; j<sequenceLength+1; j++) {
            data[j].push(nums[i+j]);
        }
    }
    return data;
}

In [10]:
testEq([3,Math.floor(nums.length/2)],shape(toData(nums,2)));
testEq([4,Math.floor(nums.length/3)],shape(toData(nums,3)));
testEq([5,Math.floor(nums.length/4)],shape(toData(nums,4)));
testEq([17,Math.floor(nums.length/16)],shape(toData(nums,16)));

In [11]:
/**
A layer that can wrap a Linear or Embedding so that it can be called multiple times during a forward pass.
*/
class MultiCallLayer {
    constructor(layer) {
        this.layer=layer;
        this.xHistory=[];
        this.weightsGradients=null;
        this.biasGradients=null;
    }
    forward(x) {
        this.xHistory.push(x);
        return this.layer.forward(x);
    }
    _matrixSum2d(a,b) {
        return (b == null) ? a : matrixSum2d(a,b);
    }
    _matrixSum1d(a,b) {
        return (b == null) ? a : matrixSum1d(a,b);
    }
    backward(gradient) {
        if (this.xHistory.length == 0) {
            throw `this.xHistory is empty`;
        }
        this.x=this.xHistory.pop();
        this.layer.backward(gradient);
        this.weightsGradients=this._matrixSum2d(this.layer.weightsGradient,this.weightsGradients);
        this.biasGradients=this._matrixSum1d(this.layer.biasGradient,this.biasGradients);
        // Note: we're not keeping x gradients
        return this.layer.xGradient;
    }
    update(lr) {
        if (this.xHistory.length != 0) {
            throw `forward has been called ${this.xHistory.length} times more than backward`;
        }
        this.layer.weightsGradient=this.weightsGradients;
        this.layer.biasGradient=this.biasGradients;
        this.layer.update(lr);
        this.weightsGradients=null;
        this.biasGradients=null;
    }
}

## TODO: test MultiCallLinear &uarr;

## First recurrent model - predict next word from previous 3 words

In [12]:
class LMModel2 {
    constructor(vocab_sz, n_hidden, sequenceLength) {
        this.sequenceLength = sequenceLength || 3;
        this.i_h = new MultiCallLayer(new Linear(vocab_sz, n_hidden));
        this.h_h = new MultiCallLayer(new Linear(n_hidden, n_hidden));
        this.h_o = new Linear(n_hidden, vocab_sz);
        this.non_linear = new ReLU();
        this.oneHotLookup = normalize(identity(vocab_sz));
    }
    
    toOneHot(x) {
        return x.map(e=>this.oneHotLookup[e]);
    }
    
    forward(x) {
        let h=0;
        for (let i=0; i<this.sequenceLength; i++) {
            h = matrixSum2d(this.i_h.forward(this.toOneHot(x[i])), h);
            h = this.non_linear.forward(this.h_h.forward(h));
        }
        return this.h_o.forward(h);
    }
    
    backward(gradients) {
        let g=this.h_o.backward(gradients);
        for (let i=this.sequenceLength; i>0; i--) {
            g=this.non_linear.backward(g);
            g=this.h_h.backward(g);
            // TODO: matrix sum
            this.i_h.backward(g);
        }
    }
    
    update(lr) {
        this.i_h.update(lr);
        this.h_h.update(lr);
        this.h_o.update(lr);
    }
}

`constructor`
- normalizing the one-hot lookup seems to make training more stable and slightly more accurate

`forward`
- If `i_h` was an embedding we wouldn't need `toOneHot`
- When we `matrixSum2d(i_h.forward(), h)` we put `h` on the right so that the initial value `0` is re-shaped to match the shape of the output of `i_h` (which is `[n_hidden,n_hidden]`)



In [13]:
function train(model_fn,data,dropLastBatch=false,shuffleBatch=true) {
    console.log('Training model',model_fn);
    let lossFn=new CrossEntropyLoss();
    let model=new model_fn(vocab.length,28,data.length-1); // data.length-1 is sequence length
    let yTrue=data[model.sequenceLength];
    let lossValues=[];
    let accuracyValues=[];
    for (let epoch=0; epoch<15; epoch++) {
        batches(data,64,dropLastBatch,shuffleBatch).forEach(batch => {
            const xb=batch; // the model will look at only sequence length tokens
            const yb=batch[model.sequenceLength];
            let preds=model.forward(xb);
            let lossValue=lossFn.forward(preds,yb);
            lossValues.push(lossValue);
            accuracyValues.push(accuracy(preds,yb));
            model.backward(lossFn.backward());
            model.update(6e-2);
        });
        console.log('epoch',epoch,new Date(),'train loss',round(mean(lossValues),3),
                    'accuracy',round(mean(accuracyValues),3));
        if (model.reset) {
            model.reset();
        }
    }
}

Train a model to predict the next word after 3 input words

In [14]:
train(LMModel2,toData(nums,3));

Training model [class LMModel2]
epoch 0 2021-05-14T13:16:34.081Z train loss 2.518 accuracy 0.403
epoch 1 2021-05-14T13:16:35.155Z train loss 2.121 accuracy 0.461
epoch 2 2021-05-14T13:16:36.235Z train loss 1.938 accuracy 0.488
epoch 3 2021-05-14T13:16:37.440Z train loss 1.833 accuracy 0.504
epoch 4 2021-05-14T13:16:38.424Z train loss 1.765 accuracy 0.513
epoch 5 2021-05-14T13:16:39.402Z train loss 1.713 accuracy 0.52
epoch 6 2021-05-14T13:16:40.479Z train loss 1.673 accuracy 0.525
epoch 7 2021-05-14T13:16:45.092Z train loss 1.64 accuracy 0.528
epoch 8 2021-05-14T13:16:50.879Z train loss 1.612 accuracy 0.531
epoch 9 2021-05-14T13:16:57.424Z train loss 1.589 accuracy 0.535
epoch 10 2021-05-14T13:17:03.759Z train loss 1.57 accuracy 0.537
epoch 11 2021-05-14T13:17:09.520Z train loss 1.553 accuracy 0.54
epoch 12 2021-05-14T13:17:16.085Z train loss 1.538 accuracy 0.542
epoch 13 2021-05-14T13:17:22.040Z train loss 1.526 accuracy 0.543
epoch 14 2021-05-14T13:17:27.515Z train loss 1.515 accurac

Train a model to predict the next word after 4 input words &darr; to show this model and training approach are not just doing well because of the sequence length

In [15]:
train(LMModel2,toData(nums,4));

Training model [class LMModel2]
epoch 0 2021-05-14T13:17:32.786Z train loss 2.862 accuracy 0.353
epoch 1 2021-05-14T13:17:38.123Z train loss 2.38 accuracy 0.423
epoch 2 2021-05-14T13:17:43.437Z train loss 2.149 accuracy 0.458
epoch 3 2021-05-14T13:17:48.883Z train loss 2.005 accuracy 0.48
epoch 4 2021-05-14T13:17:54.129Z train loss 1.909 accuracy 0.492
epoch 5 2021-05-14T13:17:59.868Z train loss 1.84 accuracy 0.502
epoch 6 2021-05-14T13:18:05.631Z train loss 1.792 accuracy 0.508
epoch 7 2021-05-14T13:18:11.316Z train loss 1.756 accuracy 0.512
epoch 8 2021-05-14T13:18:17.161Z train loss 1.723 accuracy 0.516
epoch 9 2021-05-14T13:18:22.700Z train loss 1.696 accuracy 0.519
epoch 10 2021-05-14T13:18:28.621Z train loss 1.671 accuracy 0.523
epoch 11 2021-05-14T13:18:33.920Z train loss 1.654 accuracy 0.525
epoch 12 2021-05-14T13:18:39.283Z train loss 1.637 accuracy 0.527
epoch 13 2021-05-14T13:18:44.549Z train loss 1.619 accuracy 0.529
epoch 14 2021-05-14T13:18:51.667Z train loss 1.605 accura

### What if we normalize the signal/grads coming back from loss?

`gradients` is used in a `dotProduct` to calculate weights and x gradients or each `Linear`-  might we get better training stability by normalizing the gradients we get back from the loss function?

Adding the following to the top of `backward` shows us the mean and standard deviation of `gradients` and then normalizes them.

```
        let stats=meanAndStandardDeviation(flatten(gradients))
        console.log('grads from loss function [mean,std]',stats);
        gradients=normalize(gradients);
```

This increases the standard deviation from ~0.2 to 1.0 so we reduce `lr` by about 5x - which trains pretty much like it did before (o:

Maybe this will help more when we try longer sequence lengths?

## LMModel2 with an `Embedding`

Using an embedding for the input to hidden layer makes training ~25% faster but ... if we one-hot encoded x, rather than call `toOneHot` in `LMModel2#forward`, we might see less of a difference.

In [16]:
class LMModel2_2 extends LMModel2 {
    constructor(vocab_sz, n_hidden, sequenceLength) {
        super(vocab_sz, n_hidden, sequenceLength);
        const embedding=new Embedding(vocab_sz, n_hidden);
        embedding.weights=uniform(vocab_sz, n_hidden,-2,2);
        this.i_h = new MultiCallLayer(embedding);
    }
    
    forward(x) {
        let h=0;
        for (let i=0; i<3; i++) {
            /* Use this chunk (and comment the line below) to see stats of i_h output
            const _h=this.i_h.forward(x[i]);
            console.log(meanAndStandardDeviation(flatten(_h)));
            h = matrixSum2d(_h, h);
            */
            h = matrixSum2d(this.i_h.forward(x[i]), h);
            h = this.non_linear.forward(this.h_h.forward(h));
        }
        return this.h_o.forward(h);
    }
    
}

In [17]:
train(LMModel2_2,toData(nums,3));

Training model [class LMModel2_2 extends LMModel2]
epoch 0 2021-05-14T13:18:57.011Z train loss 2.75 accuracy 0.346
epoch 1 2021-05-14T13:19:01.124Z train loss 2.357 accuracy 0.42
epoch 2 2021-05-14T13:19:05.312Z train loss 2.16 accuracy 0.455
epoch 3 2021-05-14T13:19:09.688Z train loss 2.039 accuracy 0.477
epoch 4 2021-05-14T13:19:14.197Z train loss 1.955 accuracy 0.491
epoch 5 2021-05-14T13:19:18.204Z train loss 1.891 accuracy 0.5
epoch 6 2021-05-14T13:19:22.241Z train loss 1.839 accuracy 0.508
epoch 7 2021-05-14T13:19:26.389Z train loss 1.797 accuracy 0.513
epoch 8 2021-05-14T13:19:30.540Z train loss 1.763 accuracy 0.518
epoch 9 2021-05-14T13:19:34.702Z train loss 1.733 accuracy 0.521
epoch 10 2021-05-14T13:19:38.878Z train loss 1.708 accuracy 0.523
epoch 11 2021-05-14T13:19:43.409Z train loss 1.689 accuracy 0.526
epoch 12 2021-05-14T13:19:48.096Z train loss 1.671 accuracy 0.528
epoch 13 2021-05-14T13:19:52.406Z train loss 1.655 accuracy 0.53
epoch 14 2021-05-14T13:19:56.794Z train l

## Initializing the weights of the `Embedding`

When training with large batch sizes, initializing the weights of the embedding seems to make a big difference to how the model trains - ended up using the unusual uniform `[-2,2)` init to get similar loss/accuracy.
Using batch size of 64, we see less of a difference.

`LMModel2`: Baseline (not using Embedding)
```
[ 0.00399811878414716, 1.2616704376269678 ]                  <- [mean, stdev] of i_h output first timestep
[ -0.15787033906300632, 1.4662360727552772 ]                 <- [mean, stdev] of i_h output last timestep
epoch 19 loss 1.7658807424786231 accuracy 0.5088483528450858
```

`LMModel2_2`: Embedding using Kaiming init
```
[ -0.007255579503492108, 0.25066918887623985 ]
[ -0.013472155034471406, 0.30197370989884775 ]
epoch 19 loss 2.4118482003014603 accuracy 0.3294309828478083
```

`LMModel2_2`: Embedding using uniform `[0,1)` init
```
[ 0.5006462868294974, 0.2901404931519155 ]
[ 0.40933492469236854, 0.32056279613160615 ]
epoch 19 loss 2.29436828263747 accuracy 0.3476722025592159
```

`LMModel2_2`: Embedding using uniform `[-1,1)` init
```
[ -0.00029163015159714393, 0.5849167220165731 ]
[ -0.01253279522022337, 0.6122285187374188 ]
epoch 19 loss 2.0740886848722444 accuracy 0.44949632453035665
```

`LMModel2_2`: Embedding using uniform `[-2,2)` init
```
[ -0.02280506165625058, 1.1528261720333792 ]
[ -0.04128903245762719, 1.1474591470160227 ]
epoch 19 loss 1.8707648806156076 accuracy 0.5031309556221073
```

Xavier init would use `0.3216` (`Math.sqrt(6/(vocab.length+28))`) for uniform init.

# Maintaining the State of an RNN

Organise data so the model sees contiguous text over subsequent batches

```
def group_chunks(ds, bs):
    m = len(ds) // bs
    new_ds = L()
    for i in range(m): new_ds += L(ds[i + m*j] for j in range(bs))
    return new_ds
    ```

In [18]:
function groupChunks(ds,bs=64) {
    const m = Math.floor(ds[0].length/bs);
    const newDs = [...Array(ds.length).keys()].map(i=>[]);
    for (let i=0; i<m; i++) {
        for (let j=0; j<bs; j++) {
            for (let k=0; k<ds.length; k++) {
                newDs[k].push(ds[k][i + m*j]);
            }
        }
    }
    return newDs;
}

In [19]:
let _dataTemp = toData(nums,3);
let _data=groupChunks(_dataTemp);

console.log(shape(_data));
let _batches=batches(_data,64,true,false);
for (let b=0; b<3; b++) {
    let _batch=_batches[b];
    [...Array(5).keys()].forEach(i => {
        console.log(b,i,'x',vocab[_batch[0][i]],vocab[_batch[1][i]],vocab[_batch[2][i]],'y',vocab[_batch[3][i]]);
    });
}

[ 4, 3648 ]
0 0 x one . two y .
0 1 x sixty six . y sixty
0 2 x hundred eighteen . y one
0 3 x three . one y hundred
0 4 x eighty eight . y one
1 0 x . three . y four
1 1 x sixty seven . y sixty
1 2 x one hundred nineteen y .
1 3 x hundred fifty four y .
1 4 x one hundred eighty y nine
2 0 x four . five y .
2 1 x sixty eight . y sixty
2 2 x . one hundred y twenty
2 3 x . one hundred y fifty
2 4 x nine . one y hundred


here's how we could move the initialization of `h` out of `forward` &darr;

In [20]:
class LMModel3 extends LMModel2 {
    constructor(vocab_sz, n_hidden, sequenceLength) {
        super(vocab_sz, n_hidden, sequenceLength);
        this.h=0;
    }
    
    forward(x) {
        for (let i=0; i<3; i++) {
            this.h = matrixSum2d(this.i_h.forward(this.toOneHot(x[i])), this.h);
            this.h = this.non_linear.forward(this.h_h.forward(this.h));
        }
        return this.h_o.forward(this.h);
    }
        
    reset() {
        this.h=0;
    }
}

without organising the data so the model sees contiguous text over subsequent batches, we loose a little accuracy (~2%) but i thought we'd loose more. Maybe if we were looking at validation accuracy we'd see more of a difference.

organising the data with `groupChunks` improves ~2% over our fist model accuracy.

In [21]:
train(LMModel3,groupChunks(toData(nums,3)),true,false);

Training model [class LMModel3 extends LMModel2]
epoch 0 2021-05-14T13:20:03.222Z train loss 2.955 accuracy 0.331
epoch 1 2021-05-14T13:20:09.458Z train loss 2.626 accuracy 0.377
epoch 2 2021-05-14T13:20:15.325Z train loss 2.423 accuracy 0.404
epoch 3 2021-05-14T13:20:21.361Z train loss 2.237 accuracy 0.431
epoch 4 2021-05-14T13:20:26.932Z train loss 2.09 accuracy 0.451
epoch 5 2021-05-14T13:20:32.565Z train loss 1.983 accuracy 0.467
epoch 6 2021-05-14T13:20:38.350Z train loss 1.902 accuracy 0.48
epoch 7 2021-05-14T13:20:43.982Z train loss 1.836 accuracy 0.49
epoch 8 2021-05-14T13:20:49.551Z train loss 1.782 accuracy 0.498
epoch 9 2021-05-14T13:20:55.204Z train loss 1.736 accuracy 0.505
epoch 10 2021-05-14T13:21:00.902Z train loss 1.698 accuracy 0.512
epoch 11 2021-05-14T13:21:06.548Z train loss 1.666 accuracy 0.517
epoch 12 2021-05-14T13:21:12.463Z train loss 1.639 accuracy 0.522
epoch 13 2021-05-14T13:21:18.671Z train loss 1.614 accuracy 0.525
epoch 14 2021-05-14T13:21:24.640Z train 

# Creating More Signal

## This one is a work in progress

I can't find a way to train this model to anything like the accuracy of the previous models - loss reduces for a few epochs, but then starts increasing and usually goes to infinity unless we stop training early.

Here's some things I've tried;
- normalizing gradients going in to `this.h_o.backward`
    - seems strange that the model can train at all when we do this - it must be changing the sign of some gradients when we center to mean 0?
- reset `h` at the start of forward
    - with and without shuffling data in batches
- different batch sizes
    - up to putting all of the data into a single batch
- different learning rates
    - even `lr`s that look too slow to get anywhere near good accuracy can cause loss -> infinity
- going backward over fewer timesteps than forward
- different sequence lengths: 3, 4, 8, 16
    - shorter sequences train faster ...
- different model size (`n_hidden`): 28, 32, 48, 52, 64
    - smaller models train faster ...
- swapping `i_h` `Embedding` for `Linear`

I thought that maybe this model might just not train with plain SDG? but ...

```
learn = Learner(dls, LMModel4(len(vocab), 64), loss_func=loss_func,
                metrics=accuracy, cbs=ModelResetter, 
                opt_func=partial(SGD,mom=0.0, wd=0.0, decouple_wd=False),
                moms=(0,0,0),
                wd=0)
learn.fit(15, 3e-3)
```

| epoch | train_loss | valid_loss | accuracy | time  |
|-------|------------|------------|----------|-------|
| 0     | 3.349324   | 3.404294   | 0.087321 | 00:01 |
| 1     | 3.264445   | 3.3102     | 0.157145 | 00:01 |
| 2     | 3.160242   | 3.204797   | 0.226725 | 00:01 |
| 3     | 3.034063   | 3.08055    | 0.234945 | 00:01 |
| 4     | 2.889665   | 2.945293   | 0.308105 | 00:01 |
| 5     | 2.744815   | 2.818976   | 0.310954 | 00:01 |
| 6     | 2.608936   | 2.700353   | 0.337158 | 00:01 |
| 7     | 2.480137   | 2.585766   | 0.367106 | 00:01 |
| 8     | 2.359796   | 2.476585   | 0.390544 | 00:01 |
| 9     | 2.250405   | 2.370503   | 0.418294 | 00:01 |
| 10    | 2.151468   | 2.266698   | 0.434326 | 00:01 |
| 11    | 2.062078   | 2.170193   | 0.440999 | 00:01 |
| 12    | 1.982661   | 2.08777    | 0.450602 | 00:01 |
| 13    | 1.914207   | 2.023855   | 0.455648 | 00:01 |
| 14    | 1.856968   | 1.977419   | 0.458089 | 00:01 |

Must be a mistake in here somewhere ... 

In [22]:
class LMModel4 {
    constructor(vocab_sz, n_hidden) {
        this.useEmbedding=true;
        if (this.useEmbedding) {
            const embedding=new Embedding(vocab_sz, n_hidden);
            embedding.weights=uniform(vocab_sz, n_hidden,-2,2);
            this.i_h = new MultiCallLayer(embedding);
        } else {
            this.i_h = new MultiCallLayer(new Linear(vocab_sz, n_hidden));
        }
        this.h_h = new MultiCallLayer(new Linear(n_hidden, n_hidden));
        this.h_o = new MultiCallLayer(new Linear(n_hidden, vocab_sz));
        this.non_linear = new ReLU();
        this.oneHotLookup = normalize(identity(vocab_sz));
        this.h=0;
    }
    
    toOneHot(x) {
        return x.map(e=>this.oneHotLookup[e]);
    }
    
    forward(x) {
        const result = [];
        for (let i=0; i<x.length-1; i++) {
            if (this.useEmbedding) {
                this.h = matrixSum2d(this.i_h.forward(x[i]), this.h);
            } else {
                this.h = matrixSum2d(this.i_h.forward(this.toOneHot(x[i])), this.h);
            }
            this.h = this.non_linear.forward(this.h_h.forward(this.h));
            result.push(this.h_o.forward(this.h));
        }
        return result;
    }
    
    backward(gradients) {
        for (let i=gradients.length-1; i>=0; i--) {
            let g=this.h_o.backward(gradients[i]);
            g=this.non_linear.backward(g);
            g=this.h_h.backward(g);
            // TODO: matrix sum
            this.i_h.backward(g);
        }
    }
    
    update(lr) {
        this.i_h.update(lr);
        this.h_h.update(lr);
        this.h_o.update(lr);
    }
    
    reset() {
        this.h=0;
    }
}

In [23]:
class Flatten {
    forward(x) {
        this.originalShape=shape(x);
        return [].concat(...x);
    }
    backward(x) {
        const result=[];
        for (let i=0; i<this.originalShape[0]; i++) {
            const startFrom=i*this.originalShape[1];
            result.push(x.slice(startFrom,startFrom+this.originalShape[1]));
        }
        return result;
    }
}

The following test shows that we can use the same flatten for multiple arrays, as long as the first 2 dimentions of the arrays are the same.

In [24]:
let a=[],b=[];
for (let i=0; i<3; i++) {
    a.push([]); b.push([]);
    for (let j=0; j<2; j++) {
        a[a.length-1].push(`${i}.${j}`);
        let _b=[]; b[b.length-1].push(_b);
        for (let k=0; k<4; k++) {
            _b.push(`${i}.${j}.${k}`);
        }
    }
}
let _a=JSON.stringify(a),_b=JSON.stringify(b); 
let flatten=new Flatten();
let aFlat=flatten.forward(a);
let bFlat=flatten.forward(b);
testEq([6],shape(aFlat));
testEq([6,4],shape(bFlat));
let aUnflat=flatten.backward(aFlat);
let bUnflat=flatten.backward(bFlat);
testEq([3,2],shape(aUnflat));
testEq([3,2,4],shape(bUnflat));
testEq(_a,JSON.stringify(aUnflat));
testEq(_b,JSON.stringify(bUnflat));

In [None]:
function train2(model_fn,data,dropLastBatch=false,shuffleBatch=true) {
    console.log('Training model',model_fn);
    let lossFn=new CrossEntropyLoss();
    let model=new model_fn(vocab.length,64,data.length-1); // data.length-1 is sequence length
    let flatten=new Flatten();
    let yTrue=data[model.sequenceLength];
    let lossValues=[];
    let accuracyValues=[];
    for (let epoch=0; epoch<50; epoch++) {
        batches(data,64,dropLastBatch,shuffleBatch).forEach(batch => {
            const xb=batch; // the model will look at only sequence length tokens
            const ybFlat=flatten.forward(batch.filter((e,i)=>i!=0));
            let predsFlat=flatten.forward(model.forward(xb));
            let lossValue=lossFn.forward(predsFlat,ybFlat);
            if (lossValue==Infinity) {
                throw 'lossValue==Infinity';
            }
            lossValues.push(lossValue);
            accuracyValues.push(accuracy(predsFlat,ybFlat));
            model.backward(flatten.backward(lossFn.backward()));
            model.update(3e-3);
        });
        console.log('epoch',epoch,new Date(),'train loss',round(mean(lossValues),3),
                    'accuracy',round(mean(accuracyValues),3));
        if (model.reset) {
            model.reset();
        }
    }
}

In [None]:
train2(LMModel4,groupChunks(toData(nums,16)),true,false);

Training model [class LMModel4]
epoch 0 2021-05-14T13:21:44.731Z train loss 5.105 accuracy 0.069
epoch 1 2021-05-14T13:22:06.057Z train loss 4.734 accuracy 0.081
epoch 2 2021-05-14T13:22:27.579Z train loss 4.554 accuracy 0.09
epoch 3 2021-05-14T13:22:48.569Z train loss 4.443 accuracy 0.097
epoch 4 2021-05-14T13:23:08.469Z train loss 4.366 accuracy 0.102
epoch 5 2021-05-14T13:23:26.842Z train loss 4.311 accuracy 0.107
epoch 6 2021-05-14T13:23:45.961Z train loss 4.269 accuracy 0.111
epoch 7 2021-05-14T13:24:03.850Z train loss 4.237 accuracy 0.115
epoch 8 2021-05-14T13:24:22.328Z train loss 4.214 accuracy 0.118
epoch 9 2021-05-14T13:24:44.053Z train loss 4.195 accuracy 0.121
epoch 10 2021-05-14T13:25:02.383Z train loss 4.182 accuracy 0.124
epoch 11 2021-05-14T13:25:20.162Z train loss 4.175 accuracy 0.128
epoch 12 2021-05-14T13:25:38.015Z train loss 4.173 accuracy 0.131
epoch 13 2021-05-14T13:25:56.546Z train loss 4.176 accuracy 0.134
epoch 14 2021-05-14T13:26:14.452Z train loss 4.183 accu