In [1]:
//default_exp rnn

# rnn

> Implement some of the language models in https://github.com/fastai/fastbook/blob/master/12_nlp_dive.ipynb.

## Set-up data used in tests / demos
    
Using [human_numbers.tgz](https://s3.amazonaws.com/fast-ai-sample/human_numbers.tgz), I created an even smaller dataset with the following;

```
lines = []
with open('data/human_numbers/train.txt') as f: lines.extend(f.readlines())
with open('data/human_numbers/valid.txt') as f: lines.extend(f.readlines())
with open('data/human_numbers/train_and_valid.txt','w') as f:
    f.write(' . '.join([l.strip() for l in lines[:2000]]))
```

In [1]:
const text=require('fs').readFileSync('data/human_numbers/train_and_valid.txt').toString();
text.substring(0,100);

one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo


In [4]:
const tokens=text.split(' ');
const vocab = [...new Set(tokens)];
vocab

[
  'one',      '.',        'two',
  'three',    'four',     'five',
  'six',      'seven',    'eight',
  'nine',     'ten',      'eleven',
  'twelve',   'thirteen', 'fourteen',
  'fifteen',  'sixteen',  'seventeen',
  'eighteen', 'nineteen', 'twenty',
  'thirty',   'forty',    'fifty',
  'sixty',    'seventy',  'eighty',
  'ninety',   'hundred',  'thousand'
]


In [4]:
const word2idx={};
vocab.forEach((word,idx)=>word2idx[word]=idx);
word2idx

{
  one: 0,
  '.': 1,
  two: 2,
  three: 3,
  four: 4,
  five: 5,
  six: 6,
  seven: 7,
  eight: 8,
  nine: 9,
  ten: 10,
  eleven: 11,
  twelve: 12,
  thirteen: 13,
  fourteen: 14,
  fifteen: 15,
  sixteen: 16,
  seventeen: 17,
  eighteen: 18,
  nineteen: 19,
  twenty: 20,
  thirty: 21,
  forty: 22,
  fifty: 23,
  sixty: 24,
  seventy: 25,
  eighty: 26,
  ninety: 27,
  hundred: 28,
  thousand: 29
}


In [5]:
const nums=tokens.map(e=>word2idx[e]);
nums

[
   0,  1,  2,  1,  3, 1,  4,  1,  5,  1,  6, 1,
   7,  1,  8,  1,  9, 1, 10,  1, 11,  1, 12, 1,
  13,  1, 14,  1, 15, 1, 16,  1, 17,  1, 18, 1,
  19,  1, 20,  1, 20, 0,  1, 20,  2,  1, 20, 3,
   1, 20,  4,  1, 20, 5,  1, 20,  6,  1, 20, 7,
   1, 20,  8,  1, 20, 9,  1, 21,  1, 21,  0, 1,
  21,  2,  1, 21,  3, 1, 21,  4,  1, 21,  5, 1,
  21,  6,  1, 21,  7, 1, 21,  8,  1, 21,  9, 1,
  22,  1, 22,  0,
  ... 10921 more items
]


In [6]:
const data=[[],[],[],[]];
for (let i=0; i<nums.length-4; i+=3) {
    data[0].push(nums[i]);
    data[1].push(nums[i+1]);
    data[2].push(nums[i+2]);
    data[3].push(nums[i+3]);
}

In [7]:
const tokenCounter={};
vocab.forEach(token=>tokenCounter[token]=0);
tokens.forEach(token=>tokenCounter[token]++);
let mostCommonToken=[null,0];
for (let key in tokenCounter) {
    if (tokenCounter[key]>mostCommonToken[1]) {
        mostCommonToken=[key,tokenCounter[key]];
    }
}
console.log('mostCommonToken', mostCommonToken, mostCommonToken[1]/tokens.length);

mostCommonToken [ '.', 1999 ] 0.18138099990926412


&uarr; if we can get better than 0.18 accuracy, our model will be doing better than predicting the most common token

In [8]:
/**
Imports we need in rnn.module.js
*/
import {round,flatten,exp,shape,transpose,dotProduct,randn,uniform,full,zeros} from './src/util.module.js';
import {mean,reshape,argmax,normalize,identity,meanAndStandardDeviation} from './src/util.module.js';
import {matrixSum1d,matrixSum2d,matrixSubtract1d,matrixSubtract2d,matrixMultiply1d,matrixMultiply2d} from './src/util.module.js';
import {head,tail,parseCsv,IRIS_CLASS_MAP,IrisRowHandler,shuffle,split,batches} from './src/data.module.js';
import {accuracy,Sigmoid,MSE,ReLU,Linear,Embedding,Learner} from './src/nn.module.js';
import {BinaryCrossEntropyLoss,CrossEntropyLoss} from './src/nn.module.js';

In [9]:
/**
A layer that can wrap a Linear or Embedding so that it can be called multiple times during a forward pass.
*/
class MultiCallLayer {
    constructor(layer) {
        this.layer=layer;
        this.xHistory=[];
        this.weightsGradients=null;
        this.biasGradients=null;
    }
    forward(x) {
        this.xHistory.push(x);
        return this.layer.forward(x);
    }
    _matrixSum2d(a,b) {
        return (b == null) ? a : matrixSum2d(a,b);
    }
    _matrixSum1d(a,b) {
        return (b == null) ? a : matrixSum1d(a,b);
    }
    backward(gradient) {
        if (this.xHistory.length == 0) {
            throw `this.xHistory is empty`;
        }
        this.x=this.xHistory.pop();
        this.layer.backward(gradient);
        this.weightsGradients=this._matrixSum2d(this.layer.weightsGradient,this.weightsGradients);
        this.biasGradients=this._matrixSum1d(this.layer.biasGradient,this.biasGradients);
        // Note: we're not keeping x gradients
        return this.layer.xGradient;
    }
    update(lr) {
        if (this.xHistory.length != 0) {
            throw `forward has been called ${this.xHistory.length} times more than backward`;
        }
        this.layer.weightsGradient=this.weightsGradients;
        this.layer.biasGradient=this.biasGradients;
        this.layer.update(lr);
        this.weightsGradients=null;
        this.biasGradients=null;
    }
}

## TODO: test MultiCallLinear &uarr;

## First recurrent model - predict next word from previous 3 words

In [10]:
class LMModel2 {
    constructor(vocab_sz, n_hidden) {
        this.i_h = new MultiCallLayer(new Linear(vocab_sz, n_hidden));
        this.h_h = new MultiCallLayer(new Linear(n_hidden, n_hidden));
        this.h_o = new Linear(n_hidden, vocab_sz);
        this.non_linear = new ReLU();
        this.oneHotLookup = normalize(identity(vocab_sz));
    }
    
    toOneHot(x) {
        return x.map(e=>this.oneHotLookup[e]);
    }
    
    forward(x) {
        let h=0;
        for (let i=0; i<3; i++) {
            h = matrixSum2d(this.i_h.forward(this.toOneHot(x[i])), h);
            h = this.non_linear.forward(this.h_h.forward(h));
        }
        return this.h_o.forward(h);
    }
    
    backward(gradients) {
        let g=this.h_o.backward(gradients);
        for (let i=2; i>=0; i--) {
            g=this.non_linear.backward(g);
            g=this.h_h.backward(g);
            // TODO: matrix sum
            this.i_h.backward(g);
        }
    }
    
    update(lr) {
        this.i_h.update(lr);
        this.h_h.update(lr);
        this.h_o.update(lr);
    }
}

`constructor`
- normalizing the one-hot lookup seems to make training more stable and slightly more accurate

`forward`
- If `i_h` was an embedding we wouldn't need `toOneHot`
- When we `matrixSum2d(i_h.forward(), h)` we put `h` on the right so that the initial value `0` is re-shaped to match the shape of the output of `i_h` (which is `[n_hidden,n_hidden]`)



In [11]:
let lossFn=new CrossEntropyLoss();
let model=new LMModel2(vocab.length,28);
let yTrue=data[3];
for (let epoch=0; epoch<20; epoch++) {
    let yPred=model.forward(data);
    let lossValue=lossFn.forward(yPred,yTrue);
    console.log('epoch',epoch,new Date(),'loss',lossValue,'accuracy',accuracy(yPred,yTrue));
    model.backward(lossFn.backward());
    model.update(3e-1);
}

epoch 0 2021-04-07T14:33:56.298Z loss 6.85558588702092 accuracy 0.03947726653961339
epoch 1 2021-04-07T14:33:57.221Z loss 3.706164158094287 accuracy 0.19330247753879662
epoch 2 2021-04-07T14:33:58.120Z loss 2.9473966477678295 accuracy 0.2627280152463926
epoch 3 2021-04-07T14:33:59.045Z loss 2.6045549711621745 accuracy 0.3797985298121427
epoch 4 2021-04-07T14:33:59.919Z loss 2.4233704445707223 accuracy 0.4173699972774299
epoch 5 2021-04-07T14:34:00.878Z loss 2.3182864822502993 accuracy 0.4372447590525456
epoch 6 2021-04-07T14:34:01.885Z loss 2.2319829917692404 accuracy 0.44949632453035665
epoch 7 2021-04-07T14:34:02.783Z loss 2.1619745274058455 accuracy 0.4628369180506398
epoch 8 2021-04-07T14:34:03.714Z loss 2.0989276292689296 accuracy 0.47753879662401305
epoch 9 2021-04-07T14:34:04.728Z loss 2.0444898877931754 accuracy 0.49278518921862235
epoch 10 2021-04-07T14:34:05.715Z loss 1.9953668202121395 accuracy 0.49686904437789275
epoch 11 2021-04-07T14:34:07.038Z loss 1.9510350818508193 acc

```
let lossFn=new CrossEntropyLoss();
let model=new LMModel2(vocab.length,28);
let yTrue=data[3];
for (let epoch=0; epoch<10; epoch++) {
    let yPred=model.forward(data);
    let lossValue=lossFn.forward(yPred,yTrue);
    console.log('epoch',epoch,'loss',lossValue,'accuracy',accuracy(yPred,yTrue));
    model.backward(lossFn.backward());
    model.update(3e-1);
}
epoch 0 loss 3.2278166493704066 accuracy 0.07296487884563027
...
epoch 9 loss 2.3760766295703397 accuracy 0.43642798802069155
```

### What if we normalize the signal/grads coming back from loss?

`gradients` is used in a `dotProduct` to calculate weights and x gradients or each `Linear`-  might we get better training stability by normalizing the gradients we get back from the loss function?

Adding the following to the top of `backward` shows us the mean and standard deviation of `gradients` and then normalizes them.

```
        let stats=meanAndStandardDeviation(flatten(gradients))
        console.log('grads from loss function [mean,std]',stats);
        gradients=normalize(gradients);
```

This increases the standard deviation from ~0.2 to 1.0 so we reduce `lr` by about 5x - which trains pretty much like it did before (o:

Maybe this will help more when we try longer sequence lengths?

## LMModel2 with an `Embedding`

Using an embedding for the input to hidden layer makes training ~25% faster but ... if we one-hot encoded x, rather than call `toOneHot` in `LMModel2#forward`, we might see less of a difference.

Initializing the weights of the embedding seems to make a big difference to how the model trains - ended up using the unusual uniform `[-2,2)` init to get similar loss/accuracy.

`LMModel2`: Baseline (not using Embedding)
```
[ 0.00399811878414716, 1.2616704376269678 ]                  <- [mean, stdev] of i_h output first timestep
[ -0.15787033906300632, 1.4662360727552772 ]                 <- [mean, stdev] of i_h output last timestep
epoch 19 loss 1.7658807424786231 accuracy 0.5088483528450858
```

`LMModel2_2`: Embedding using Kaiming init
```
[ -0.007255579503492108, 0.25066918887623985 ]
[ -0.013472155034471406, 0.30197370989884775 ]
epoch 19 loss 2.4118482003014603 accuracy 0.3294309828478083
```

`LMModel2_2`: Embedding using uniform `[0,1)` init
```
[ 0.5006462868294974, 0.2901404931519155 ]
[ 0.40933492469236854, 0.32056279613160615 ]
epoch 19 loss 2.29436828263747 accuracy 0.3476722025592159
```

`LMModel2_2`: Embedding using uniform `[-1,1)` init
```
[ -0.00029163015159714393, 0.5849167220165731 ]
[ -0.01253279522022337, 0.6122285187374188 ]
epoch 19 loss 2.0740886848722444 accuracy 0.44949632453035665
```

`LMModel2_2`: Embedding using uniform `[-2,2)` init
```
[ -0.02280506165625058, 1.1528261720333792 ]
[ -0.04128903245762719, 1.1474591470160227 ]
epoch 19 loss 1.8707648806156076 accuracy 0.5031309556221073
```

Xavier init would use `0.3216` (`Math.sqrt(6/(vocab.length+28))`) for uniform init.

In [16]:
class LMModel2_2 {
    constructor(vocab_sz, n_hidden) {
        const embedding=new Embedding(vocab_sz, n_hidden);
        embedding.weights=uniform(vocab_sz, n_hidden,-2,2);
        this.i_h = new MultiCallLayer(embedding);
        this.h_h = new MultiCallLayer(new Linear(n_hidden, n_hidden));
        this.h_o = new Linear(n_hidden, vocab_sz);
        this.non_linear = new ReLU();
    }
    
    forward(x) {
        let h=0;
        for (let i=0; i<3; i++) {
            /* Use this chunk (and comment the line below) to see stats of i_h output
            const _h=this.i_h.forward(x[i]);
            console.log(meanAndStandardDeviation(flatten(_h)));
            h = matrixSum2d(_h, h);
            */
            h = matrixSum2d(this.i_h.forward(x[i]), h);
            h = this.non_linear.forward(this.h_h.forward(h));
        }
        return this.h_o.forward(h);
    }
    
    backward(gradients) {
        let g=this.h_o.backward(gradients);
        for (let i=2; i>=0; i--) {
            g=this.non_linear.backward(g);
            g=this.h_h.backward(g);
            // TODO: matrix sum
            this.i_h.backward(g);
        }
    }
    
    update(lr) {
        this.i_h.update(lr);
        this.h_h.update(lr);
        this.h_o.update(lr);
    }
}

In [17]:
let lossFn=new CrossEntropyLoss();
let model=new LMModel2_2(vocab.length,28);
let yTrue=data[3];
for (let epoch=0; epoch<20; epoch++) {
    let yPred=model.forward(data);
    let lossValue=lossFn.forward(yPred,yTrue);
    console.log('epoch',epoch,new Date(),'loss',lossValue,'accuracy',accuracy(yPred,yTrue));
    model.backward(lossFn.backward());
    model.update(3e-1);
}

epoch 0 2021-04-07T14:37:00.302Z loss 7.313158874221775 accuracy 0.009801252382248844
epoch 1 2021-04-07T14:37:01.156Z loss 4.201322620685817 accuracy 0.04029403757146747
epoch 2 2021-04-07T14:37:02.117Z loss 3.5045546958712617 accuracy 0.08657772937653145
epoch 3 2021-04-07T14:37:03.115Z loss 3.1463271031454774 accuracy 0.14456847263817044
epoch 4 2021-04-07T14:37:04.031Z loss 2.9013633222434336 accuracy 0.23985842635447863
epoch 5 2021-04-07T14:37:04.960Z loss 2.7227818060725757 accuracy 0.3087394500408385
epoch 6 2021-04-07T14:37:05.822Z loss 2.5738498626440065 accuracy 0.36591342227062346
epoch 7 2021-04-07T14:37:06.681Z loss 2.452707705533566 accuracy 0.39912877756602233
epoch 8 2021-04-07T14:37:07.686Z loss 2.358470421570021 accuracy 0.4244486795534985
epoch 9 2021-04-07T14:37:08.621Z loss 2.284588957613364 accuracy 0.44432344132861423
epoch 10 2021-04-07T14:37:09.466Z loss 2.2242727992122493 accuracy 0.46202014701878574
epoch 11 2021-04-07T14:37:10.321Z loss 2.1725631749838397 a