# RNN Encoder-decoder 

use fastai human numbers data to train. First a classifier then do decoder.

The data is from [fastai book chap 12](https://github.com/fastai/fastbook/blob/master/12_nlp_dive.ipynb). Looks like:

```
one 
two 
three 
...
two hundred seven 
two hundred eight 
...
```

In [1]:
from fastai2.text.all import *
path = untar_data(URLs.HUMAN_NUMBERS)
path

Path('/Users/parrt/.fastai/data/human_numbers')

## Support

In [2]:
import codecs
import os
import re
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
#from torch.nn.functional import softmax
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

dtype = torch.float
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [3]:
def get_text(filename:str):
    """
    Load and return the text of a text file, assuming latin-1 encoding as that
    is what the BBC corpus uses.  Use codecs.open() function not open().
    """
    f = codecs.open(filename, encoding='latin-1', mode='r')
    s = f.read()
    f.close()
    return s

In [4]:
def getvocab(strings):
    letters = [list(l) for l in strings]
    vocab = set([c for cl in letters for c in cl])
    vocab = sorted(list(vocab))
    ctoi = {c:i for i, c in enumerate(vocab)}
    return vocab, ctoi

In [5]:
def softmax(y):
    expy = torch.exp(y)
    if len(y.shape)==1: # 1D case can't use axis arg
        return expy / torch.sum(expy)
    return expy / torch.sum(expy, axis=1).reshape(-1,1)

In [6]:
def get_max_len(X):
    max_len = 0
    for x in X:
        max_len = max(max_len, len(x))
    return max_len

## Load

In [7]:
text = get_text(path/'train.txt').strip()
print(text[:28])
lines = text.lower().split('\n')
print(lines[:5])

one 
two 
three 
four 
five 
['one ', 'two ', 'three ', 'four ', 'five ']


In [8]:
# get unique vocab but don't sort; keep order so 'one'=1 etc...
# use '#' to indicate padded (unused) char for embedding purposes
v = set('#')
X_vocab = ['#']
for t in text.split():
    if t not in v:
        X_vocab.append(t)
        v.add(t)
X_vocab[:10]

['#', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']

In [9]:
X_tokens = [line.strip().split(' ') for line in lines]
X_tokens[18:23]

[['nineteen'],
 ['twenty'],
 ['twenty', 'one'],
 ['twenty', 'two'],
 ['twenty', 'three']]

In [10]:
wtoi = {w:i for i,w in enumerate(X_vocab)}
wtoi['#'], wtoi['one'], wtoi['two']

(0, 1, 2)

In [11]:
max_len = get_max_len(X_tokens)
X = torch.zeros(len(X_tokens),max_len, dtype=torch.long) # zero implies padding
print(X.shape)
for i in range(len(X_tokens)):
    x = X_tokens[i]
    pad = max_len - len(x)
    for j in range(len(x)):
        X[i,j+pad] = wtoi[X_tokens[i][j]]
X

torch.Size([7999, 6])


tensor([[ 0,  0,  0,  0,  0,  1],
        [ 0,  0,  0,  0,  0,  2],
        [ 0,  0,  0,  0,  0,  3],
        ...,
        [ 7, 29,  9, 28, 27,  7],
        [ 7, 29,  9, 28, 27,  8],
        [ 7, 29,  9, 28, 27,  9]])

## Classifier

### Create y target class vector

y  is just 1..len(X_tokens)

In [12]:
y = torch.tensor(range(1,len(X_tokens)+1))
y

tensor([   1,    2,    3,  ..., 7997, 7998, 7999])

In [13]:
X_tokens[0], X_tokens[-1]

(['one'], ['seven', 'thousand', 'nine', 'hundred', 'ninety', 'nine'])

### Split out validation set

In [14]:
ntrain = int(len(X)*.80)
X_train, y_train = X[:ntrain], y[:ntrain]
X_valid, y_valid = X[ntrain:], y[ntrain:]

In [24]:
n = len(X_train)

nhidden = 128
batch_size = 32
embed_sz = 10
nbatches = n // batch_size
n = nbatches * batch_size
X_train = X_train[0:n]
y_train = y_train[0:n]
nfeatures = len(X_vocab)
nclasses = len(X_tokens) # they are unique targets

print(f"{n:,d} training records, batch size {batch_size}, {nfeatures} features (words), {nclasses} target classes, state is {nhidden}-vector")

6,368 training records, batch size 32, 30 features (words), 7999 target classes, state is 128-vector


### Train

In [25]:
#%%time 
#torch.manual_seed(0) # SET SEED FOR TESTING
E = torch.randn(embed_sz,      len(X_vocab),  device=device, dtype=torch.float64, requires_grad=True) # embedding
W = torch.eye(nhidden,         nhidden,       device=device, dtype=torch.float64, requires_grad=True)
U = torch.randn(nhidden,       embed_sz,      device=device, dtype=torch.float64, requires_grad=True) # input converter
#B = torch.zeros(nhidden,       nchunks,       device=device, dtype=torch.float64, requires_grad=True)
V = torch.randn(nclasses,      nhidden,       device=device, dtype=torch.float64, requires_grad=True) # take RNN output (h) and predict target

with torch.no_grad():
    E[:,0] = 0.0  # padding word gives 0 vector

optimizer = torch.optim.Adam([E,W,U,V], lr=0.001, weight_decay=0.0)

history = []
epochs = 12
for epoch in range(1, epochs+1):
#     print(f"EPOCH {epoch}")
    epoch_training_loss = 0.0
    epoch_training_accur = 0.0
    total = 0
    for p in range(0, n, batch_size):  # do one epoch
        loss = 0
        batch_X = X_train[p:p+batch_size]
        batch_y = y_train[p:p+batch_size]
        H = torch.zeros(nhidden, batch_size, dtype=torch.float64, requires_grad=False)
        for t in range(max_len):
            x_step_t = batch_X[:,t]
            # column E[i] is the embedding for char index i. same as multiple E.mm(onehot(i))
            embedding_step_t = E[:,x_step_t]
            H = W.mm(H) + U.mm(embedding_step_t)
            H = torch.tanh(H)
        o = V.mm(H)
        o = o.T # make it batch_size x nclasses
        p = softmax(o)
        correct = torch.argmax(p, dim=1)==batch_y
        epoch_training_accur += torch.sum(correct)

        loss = F.cross_entropy(o, batch_y)
#         print(loss.item())
        total += len(batch_y)

        # update matrices based upon loss computed from a batch
        optimizer.zero_grad()
        loss.backward() # autograd computes U.grad, M.grad, ...
        optimizer.step()

        epoch_training_loss += loss.detach().item()

    epoch_training_loss /= nbatches
    epoch_training_accur /= n
    print(f"Epoch {epoch:3d} training loss {epoch_training_loss:7.4f} accur {epoch_training_accur:7.4f}")

#     with torch.no_grad():
#         o = forward(X_train, max_len, vocab)#, apply_softmax=False)
#         train_loss = cross_entropy(o, y_train).item()
#         correct = torch.argmax(o, dim=1).detach()==torch.tensor(y_train)
#         train_accur = torch.sum(correct) / float(len(X_train))

#         o = forward(X_valid, max_len, vocab)
#         valid_loss = cross_entropy(o, y_valid).item()
#         correct = torch.argmax(o, dim=1).detach()==torch.tensor(y_valid)
#         valid_accur = torch.sum(correct) / float(len(X_valid))

#         history.append((train_loss, valid_loss))
#         print(f"Epoch: {epoch:3d} accum loss {epoch_training_loss:7.4f} accur {epoch_training_accur:4.3f} | train loss {train_loss:7.4f} accur {train_accur:4.3f} | valid loss {valid_loss:7.4f} accur {valid_accur:4.3f}")

Epoch   1 training loss 29.7416 accur  0.0002
Epoch   2 training loss 22.2778 accur  0.0002
Epoch   3 training loss 18.6086 accur  0.0002


KeyboardInterrupt: 

In [None]:
X_train[:,0]

## Translation

### Define y sequence of digits

So `'one' -> '1'`, `['twenty', 'three'] -> ['2','3']`, etc...

In [None]:
Y_vocab = {d:i for i,d in enumerate("0123456789")}
Y_vocab

In [None]:
Y = []
for i in range(0,len(X)):
    Y.append([int(d) for d in str(i+1)])
Y[:12]