In [1]:
//default_exp rnn

# rnn

> Implement some of the language models in https://github.com/fastai/fastbook/blob/master/12_nlp_dive.ipynb.

## Set-up data used in tests / demos
    
Using [human_numbers.tgz](https://s3.amazonaws.com/fast-ai-sample/human_numbers.tgz), I created an even smaller dataset with the following:

```
lines = []
with open('data/human_numbers/train.txt') as f: lines.extend(f.readlines())
with open('data/human_numbers/valid.txt') as f: lines.extend(f.readlines())
with open('data/human_numbers/train_and_valid.txt','w') as f:
    f.write(' . '.join([l.strip() for l in lines[:2000]]))
```

In [1]:
// Read the combined train + validation corpus as a UTF-8 string and preview its first 100 characters.
const text = require('fs').readFileSync('data/human_numbers/train_and_valid.txt', 'utf8');
text.slice(0, 100);

one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo


In [2]:
// Split the corpus on single spaces: every word and every '.' separator becomes a token.
const tokens = text.split(' ');
// Unique tokens in first-seen order. A Set remembers insertion order, so this yields the
// same list as filtering on indexOf, without rescanning the array for every token.
const vocab = [...new Set(tokens)];
vocab;

[
  'one',      '.',        'two',
  'three',    'four',     'five',
  'six',      'seven',    'eight',
  'nine',     'ten',      'eleven',
  'twelve',   'thirteen', 'fourteen',
  'fifteen',  'sixteen',  'seventeen',
  'eighteen', 'nineteen', 'twenty',
  'thirty',   'forty',    'fifty',
  'sixty',    'seventy',  'eighty',
  'ninety',   'hundred',  'thousand'
]


In [3]:
// Reverse lookup for vocab: token -> its position in `vocab`.
const word2idx = {};
for (const [position, token] of vocab.entries()) {
    word2idx[token] = position;
}
word2idx;

{
  one: 0,
  '.': 1,
  two: 2,
  three: 3,
  four: 4,
  five: 5,
  six: 6,
  seven: 7,
  eight: 8,
  nine: 9,
  ten: 10,
  eleven: 11,
  twelve: 12,
  thirteen: 13,
  fourteen: 14,
  fifteen: 15,
  sixteen: 16,
  seventeen: 17,
  eighteen: 18,
  nineteen: 19,
  twenty: 20,
  thirty: 21,
  forty: 22,
  fifty: 23,
  sixty: 24,
  seventy: 25,
  eighty: 26,
  ninety: 27,
  hundred: 28,
  thousand: 29
}


In [4]:
// The whole corpus re-expressed as vocab indices (one number per token).
const nums = Array.from(tokens, (token) => word2idx[token]);
nums;

[
   0,  1,  2,  1,  3, 1,  4,  1,  5,  1,  6, 1,
   7,  1,  8,  1,  9, 1, 10,  1, 11,  1, 12, 1,
  13,  1, 14,  1, 15, 1, 16,  1, 17,  1, 18, 1,
  19,  1, 20,  1, 20, 0,  1, 20,  2,  1, 20, 3,
   1, 20,  4,  1, 20, 5,  1, 20,  6,  1, 20, 7,
   1, 20,  8,  1, 20, 9,  1, 21,  1, 21,  0, 1,
  21,  2,  1, 21,  3, 1, 21,  4,  1, 21,  5, 1,
  21,  6,  1, 21,  7, 1, 21,  8,  1, 21,  9, 1,
  22,  1, 22,  0,
  ... 10921 more items
]


In [5]:
// Slice `nums` into windows of 4 consecutive tokens, advancing 3 tokens at a time.
// data[k] collects the k-th token of every window, so data[0..2] hold the three
// context tokens and data[3] holds the token that follows them.
const data = [[], [], [], []];
for (let start = 0; start < nums.length - 4; start += 3) {
    data.forEach((column, offset) => column.push(nums[start + offset]));
}

In [6]:
// Tally occurrences of every vocab token across the corpus.
const tokenCounter = {};
for (const token of vocab) {
    tokenCounter[token] = 0;
}
for (const token of tokens) {
    tokenCounter[token] += 1;
}

// Track the [token, count] pair with the highest count; ties keep the first one seen.
let mostCommonToken = [null, 0];
for (const [token, count] of Object.entries(tokenCounter)) {
    if (count > mostCommonToken[1]) {
        mostCommonToken = [token, count];
    }
}
// The final value logged is the share of the corpus taken up by that token.
console.log('mostCommonToken', mostCommonToken, mostCommonToken[1]/tokens.length);

mostCommonToken [ '.', 1999 ] 0.18138099990926412


&uarr; if we can get better than 0.18 accuracy, our model will be doing better than predicting the most common token

In [7]:
/**
Imports we need in rnn.module.js
*/
import {round,flatten,exp,shape,transpose,dotProduct,randn,full,zeros,mean,reshape,argmax} from './src/util.module.js';
import {normalize,identity,meanAndStandardDeviation} from './src/util.module.js';
import {matrixSum1d,matrixSum2d,matrixSubtract1d,matrixSubtract2d,matrixMultiply1d,matrixMultiply2d} from './src/util.module.js';
import {head,tail,parseCsv,IRIS_CLASS_MAP,IrisRowHandler,shuffle,split,batches} from './src/data.module.js';
import {accuracy,Sigmoid,MSE,ReLU,Linear,Learner} from './src/nn.module.js';
import {BinaryCrossEntropyLoss,CrossEntropyLoss} from './src/nn.module.js';

In [8]:
/**
A Linear that can be called multiple times during a forward pass.

Every `forward` call pushes its input onto a stack. Every `backward` call pops the most
recent input, lets `Linear.backward` compute gradients for that one call and adds them to
running totals. `update` hands the totals to `Linear.update` and then clears them.

`backward` must therefore be called once per `forward` call (most recent first) before
`update` is called.
*/
class MultiCallLinear extends Linear{
    /**
     * All arguments are passed straight through to the `Linear` constructor.
     */
    constructor(inputDim,numHidden=1,bias=true) {
        super(inputDim,numHidden,bias);
        // Inputs seen by forward() that have not yet been consumed by backward().
        this.xHistory=[];
        // Gradient totals accumulated over the backward() calls since the last update().
        this.weightsGradients=null;
        this.biasGradients=null;
    }

    /**
     * Remember `x` for the matching backward() call, then delegate to `Linear.forward`.
     * @returns whatever `Linear.forward(x)` returns
     */
    forward(x) {
        this.xHistory.push(x);
        return super.forward(x);
    }

    /**
     * Back-propagate `gradient` through the most recent forward() call that has not been
     * back-propagated yet, and add the resulting weight/bias gradients to the running totals.
     * @returns `this.xGradient` as left by `Linear.backward` for this call only
     * @throws {Error} if there is no outstanding forward() call
     */
    backward(gradient) {
        if (this.xHistory.length === 0) {
            throw new Error(`this.xHistory is empty`);
        }
        // Restore the input for this call before delegating
        // (presumably Linear.backward reads this.x - confirm against nn.module.js).
        this.x=this.xHistory.pop();
        super.backward(gradient);
        this.weightsGradients=(this.weightsGradients == null)
                ? this.weightsGradient
                : matrixSum2d(this.weightsGradient,this.weightsGradients);
        this.biasGradients=(this.biasGradients == null)
                ? this.biasGradient
                : matrixSum1d(this.biasGradient,this.biasGradients);
        // Note: we're not keeping x gradients
        return this.xGradient;
    }

    /**
     * Apply the accumulated gradients via `Linear.update(lr)` and reset the totals.
     * @throws {Error} if some forward() calls have not been matched by backward()
     */
    update(lr) {
        if (this.xHistory.length !== 0) {
            throw new Error(`forward has been called ${this.xHistory.length} times more than backward`);
        }
        // Replace the single-call gradients with the accumulated totals
        // (presumably what Linear.update applies - confirm against nn.module.js).
        // Assigning via `this` is what the previous `super.x = ...` form did in effect.
        this.weightsGradient=this.weightsGradients;
        this.biasGradient=this.biasGradients;
        super.update(lr);
        this.weightsGradients=null;
        this.biasGradients=null;
    }
}

## TODO: test MultiCallLinear &uarr;

## First recurrent model - predict next word from previous 3 words

In [9]:
/**
Recurrent language model: predicts the next token from the previous `seq_len` tokens.

The same three layers are reused at every step: `i_h` (input to hidden), `h_h` (hidden to
hidden) and, after the last step, `h_o` (hidden to output).
*/
class LMModel2 {
    /**
     * @param vocab_sz number of distinct tokens (width of the one-hot input and of the output)
     * @param n_hidden width of the hidden state
     * @param seq_len  number of input tokens consumed per prediction (defaults to 3)
     * @throws {Error} if seq_len is not a positive integer
     */
    constructor(vocab_sz, n_hidden, seq_len=3) {
        if (!Number.isInteger(seq_len) || seq_len < 1) {
            throw new Error(`seq_len must be a positive integer, got ${seq_len}`);
        }
        this.i_h = new MultiCallLinear(vocab_sz, n_hidden);
        this.h_h = new MultiCallLinear(n_hidden, n_hidden);
        this.h_o = new Linear(n_hidden, vocab_sz);
        this.non_linear = new ReLU();
        // Row e of the lookup is the (normalized) one-hot encoding of token index e.
        this.oneHotLookup = normalize(identity(vocab_sz));
        this.seq_len = seq_len;
    }

    /**
     * @param x array of token indices
     * @returns array with one lookup row per token index
     */
    toOneHot(x) {
        return x.map(e=>this.oneHotLookup[e]);
    }

    /**
     * @param x array whose first `seq_len` entries are arrays of token indices
     *          (x[i] holds the i-th input token of every sample); extra entries are ignored
     * @returns the output of `h_o` for the final hidden state
     * @throws {Error} if x has fewer than `seq_len` entries
     */
    forward(x) {
        // Check up front so a short input cannot leave i_h/h_h with unmatched forward calls.
        if (x.length < this.seq_len) {
            throw new Error(`expected at least ${this.seq_len} input positions, got ${x.length}`);
        }
        // h starts as the scalar 0; it is passed as the second argument to matrixSum2d.
        let h=0;
        for (let i=0; i<this.seq_len; i++) {
            h = matrixSum2d(this.i_h.forward(this.toOneHot(x[i])), h);
            h = this.non_linear.forward(this.h_h.forward(h));
        }
        return this.h_o.forward(h);
    }

    /**
     * Back-propagate through the steps taken by the last forward() call, newest first.
     * @param gradients gradients of the loss with respect to the output of forward()
     */
    backward(gradients) {
        let g=this.h_o.backward(gradients);
        for (let i=this.seq_len-1; i>=0; i--) {
            // NOTE(review): one ReLU instance serves every step - confirm that its backward
            // does not depend on state saved by the most recent forward call only.
            g=this.non_linear.backward(g);
            g=this.h_h.backward(g);
            // TODO: matrix sum
            // The same g feeds i_h here and carries on to the previous step: both were
            // operands of the sum in forward().
            this.i_h.backward(g);
        }
    }

    /**
     * Apply the gradients gathered by backward() to all three layers.
     * @param lr learning rate passed to each layer's update
     */
    update(lr) {
        this.i_h.update(lr);
        this.h_h.update(lr);
        this.h_o.update(lr);
    }
}

`constructor`
- normalizing the one-hot lookup seems to make training more stable and slightly more accurate

`forward`
- If `i_h` was an embedding we wouldn't need `toOneHot`
- When we `matrixSum2d(i_h.forward(), h)` we put `h` on the right so that the initial value `0` is re-shaped to match the shape of the output of `i_h` (which should be `[number of samples, n_hidden]` - one row per input sample)



In [10]:
// Full-batch training: each epoch pushes all of `data` through the model once,
// logs loss and accuracy, then back-propagates and updates with a learning rate of 0.3.
let lossFn = new CrossEntropyLoss();
let model = new LMModel2(vocab.length, 28);
// The 4th row of `data` is the token that follows each 3-token context.
let yTrue = data[3];
for (const epoch of Array(20).keys()) {
    const yPred = model.forward(data);
    const lossValue = lossFn.forward(yPred, yTrue);
    const epochAccuracy = accuracy(yPred, yTrue);
    console.log('epoch', epoch, 'loss', lossValue, 'accuracy', epochAccuracy);
    const lossGradients = lossFn.backward();
    model.backward(lossGradients);
    model.update(3e-1);
}

epoch 0 loss 9.463804802334936 accuracy 0.021780560849441872
epoch 1 loss 4.423252671947863 accuracy 0.11952082766131228
epoch 2 loss 2.952456813105928 accuracy 0.22951265995099374
epoch 3 loss 2.5801670238430106 accuracy 0.29267628641437515
epoch 4 loss 2.3919609654912244 accuracy 0.39940103457664033
epoch 5 loss 2.249359315894222 accuracy 0.4252654505853526
epoch 6 loss 2.160554415187405 accuracy 0.45657500680642527
epoch 7 loss 2.091707674548205 accuracy 0.4707323713585625
epoch 8 loss 2.041981511649167 accuracy 0.4778110536346311
epoch 9 loss 1.997407597559624 accuracy 0.4756329975496869
epoch 10 loss 1.9642968522151296 accuracy 0.4938742172610945
epoch 11 loss 1.9282146869101724 accuracy 0.4848897359106997
epoch 12 loss 1.9013480879692413 accuracy 0.5028586986114892
epoch 13 loss 1.8767241613264303 accuracy 0.476449768581541
epoch 14 loss 1.8811034015094106 accuracy 0.4857065069425538
epoch 15 loss 1.8723834145306242 accuracy 0.465287231146202
epoch 16 loss 1.9324484061780247 accu

```
let lossFn=new CrossEntropyLoss();
let model=new LMModel2(vocab.length,28);
let yTrue=data[3];
for (let epoch=0; epoch<10; epoch++) {
    let yPred=model.forward(data);
    let lossValue=lossFn.forward(yPred,yTrue);
    console.log('epoch',epoch,'loss',lossValue,'accuracy',accuracy(yPred,yTrue));
    model.backward(lossFn.backward());
    model.update(3e-1);
}
epoch 0 loss 3.2278166493704066 accuracy 0.07296487884563027
...
epoch 9 loss 2.3760766295703397 accuracy 0.43642798802069155
```

### What if we normalize the signal/grads coming back from loss?

`gradients` is used in a `dotProduct` to calculate the weights and x gradients for each `Linear` - might we get better training stability by normalizing the gradients we get back from the loss function?

Adding the following to the top of `backward` shows us the mean and standard deviation of `gradients` and then normalizes them.

```
        let stats=meanAndStandardDeviation(flatten(gradients))
        console.log('grads from loss function [mean,std]',stats);
        gradients=normalize(gradients);
```

This increases the standard deviation from ~0.2 to 1.0 so we reduce `lr` by about 5x - which trains pretty much like it did before (o:

Maybe this will help more when we try longer sequence lengths?