# RNN Encoder-decoder 

use fastai human numbers data to train. First a classifier then do decoder.  The classifier is 1-to-1 so no possibility of generalizing. Just verifying my training loop and RNN.

The data is from [fastai book chap 12](https://github.com/fastai/fastbook/blob/master/12_nlp_dive.ipynb). Looks like:

```
one 
two 
three 
...
two hundred seven 
two hundred eight 
...
```

In [1]:
from fastai2.text.all import *
path = untar_data(URLs.HUMAN_NUMBERS)
path

Path('/home/parrt/.fastai/data/human_numbers')

## Support

In [2]:
import codecs
import os
import re
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
#from torch.nn.functional import softmax
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

dtype = torch.float
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [3]:
def get_text(filename:str):
    """
    Load and return the text of a text file, assuming latin-1 encoding as that
    is what the BBC corpus uses.  Use codecs.open() function not open().
    """
    f = codecs.open(filename, encoding='latin-1', mode='r')
    s = f.read()
    f.close()
    return s

In [4]:
def getvocab(strings):
    letters = [list(l) for l in strings]
    vocab = set([c for cl in letters for c in cl])
    vocab = sorted(list(vocab))
    ctoi = {c:i for i, c in enumerate(vocab)}
    return vocab, ctoi

In [5]:
def softmax(y):
    expy = torch.exp(y)
    if len(y.shape)==1: # 1D case can't use axis arg
        return expy / torch.sum(expy)
    return expy / torch.sum(expy, axis=1).reshape(-1,1)

In [6]:
def get_max_len(X):
    max_len = 0
    for x in X:
        max_len = max(max_len, len(x))
    return max_len

## Load

In [7]:
text = get_text(path/'train.txt').strip()
print(text[:28])
lines = text.lower().split('\n')
print(lines[:5])

one 
two 
three 
four 
five 
['one ', 'two ', 'three ', 'four ', 'five ']


In [8]:
# get unique vocab but don't sort; keep order so 'one'=1 etc...
# use '#' to indicate padded (unused) char for embedding purposes
v = set('#')
X_vocab = ['#']
for t in text.split():
    if t not in v:
        X_vocab.append(t)
        v.add(t)
X_vocab[:10]

['#', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']

In [9]:
X_tokens = [line.strip().split(' ') for line in lines]
X_tokens[18:23]

[['nineteen'],
 ['twenty'],
 ['twenty', 'one'],
 ['twenty', 'two'],
 ['twenty', 'three']]

In [10]:
X_vocab = {w:i for i,w in enumerate(X_vocab)}
X_vocab['#'], X_vocab['one'], X_vocab['two']

(0, 1, 2)

In [11]:
X_max_len = get_max_len(X_tokens)
X = torch.zeros(len(X_tokens), X_max_len, device=device, dtype=torch.long) # zero implies padding
print(X.shape)
for i in range(len(X_tokens)):
    x = X_tokens[i]
    pad = X_max_len - len(x)
    for j in range(len(x)):
        X[i,j+pad] = X_vocab[X_tokens[i][j]]
X

torch.Size([7999, 6])


tensor([[ 0,  0,  0,  0,  0,  1],
        [ 0,  0,  0,  0,  0,  2],
        [ 0,  0,  0,  0,  0,  3],
        ...,
        [ 7, 29,  9, 28, 27,  7],
        [ 7, 29,  9, 28, 27,  8],
        [ 7, 29,  9, 28, 27,  9]], device='cuda:0')

## Classifier

### Create y target class vector

y  is just 1..len(X_tokens)

In [12]:
y = torch.tensor(range(1,len(X_tokens)+1), device=device)
y

tensor([   1,    2,    3,  ..., 7997, 7998, 7999], device='cuda:0')

In [13]:
X_tokens[0], X_tokens[-1]

(['one'], ['seven', 'thousand', 'nine', 'hundred', 'ninety', 'nine'])

In [14]:
n = len(X)

nhidden = 128
batch_size = 32
embed_sz = 10
nbatches = n // batch_size
n = nbatches * batch_size
X = X[0:n]
y = y[0:n]
nclasses = len(X_tokens) # they are unique targets

print(f"{n:,d} training records, batch size {batch_size}, {len(X_vocab)} features (words), {nclasses} target classes, state is {nhidden}-vector")

7,968 training records, batch size 32, 30 features (words), 7999 target classes, state is 128-vector


### Train

In [15]:
def forward(batch_X, max_len:int, vocab:dict):
    H = torch.zeros(nhidden, len(batch_X), device=device, dtype=torch.float64, requires_grad=False)
    for t in range(max_len):
        x_step_t = batch_X[:,t]
        # column E[i] is the embedding for char index i. same as multiple E.mm(onehot(i))
        embedding_step_t = E[:,x_step_t]
        H = W.mm(H) + U.mm(embedding_step_t) + Bx
        H = torch.tanh(H)        
    o = V.mm(H) + Bo
    o = o.T # make it batch_size x nclasses
    return o

In [16]:
#%%time 
#torch.manual_seed(0) # SET SEED FOR TESTING
E = torch.randn(embed_sz,      len(X_vocab),  device=device, dtype=torch.float64, requires_grad=True) # embedding
W = torch.eye(nhidden,         nhidden,       device=device, dtype=torch.float64, requires_grad=True)
U = torch.randn(nhidden,       embed_sz,      device=device, dtype=torch.float64, requires_grad=True) # input converter
Bx = torch.zeros(nhidden,      batch_size,    device=device, dtype=torch.float64, requires_grad=True)
Bo = torch.zeros(nclasses,     batch_size,    device=device, dtype=torch.float64, requires_grad=True)
V = torch.randn(nclasses,      nhidden,       device=device, dtype=torch.float64, requires_grad=True) # take RNN output (h) and predict target

with torch.no_grad():
    E[:,0] = 0.0  # padding word gives 0 vector

optimizer = torch.optim.Adam([E,W,U,V,Bx,Bo], lr=0.005, weight_decay=0.0)
scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, 
                                              mode='triangular2',
                                              step_size_up=4,
                                              base_lr=0.001, max_lr=0.005,
                                              cycle_momentum=False)

history = []
epochs = 70 # gets to 100% at 70 with lr=0.001
epochs = 55 # gets to 100% at 50 with cyclic base_lr=0.001, max_lr=0.005 every 4
for epoch in range(1, epochs+1):
#     print(f"EPOCH {epoch}")
    epoch_training_loss = 0.0
    epoch_training_accur = 0.0
    total = 0
    for p in range(0, n, batch_size):  # do one epoch
        loss = 0
        batch_X = X[p:p+batch_size]
        batch_y = y[p:p+batch_size]
        o = forward(batch_X, X_max_len, X_vocab)
        correct = torch.argmax(softmax(o), dim=1)==batch_y
        epoch_training_accur += torch.sum(correct)

        loss = F.cross_entropy(o, batch_y)
#         print(loss.item())
        total += len(batch_y)

        # update matrices based upon loss computed from a batch
        optimizer.zero_grad()
        loss.backward() # autograd computes U.grad, M.grad, ...
        optimizer.step()

        epoch_training_loss += loss.detach().item()

    scheduler.step()
    epoch_training_loss /= nbatches
    epoch_training_accur /= n
    print(f"Epoch {epoch:3d} training loss {epoch_training_loss:7.4f} accur {epoch_training_accur:7.4f}   LR {scheduler.get_last_lr()[0]:7.6f}")

Epoch   1 training loss 28.5090 accur  0.0001   LR 0.002000
Epoch   2 training loss 19.8968 accur  0.0003   LR 0.003000
Epoch   3 training loss 15.1256 accur  0.0003   LR 0.004000


KeyboardInterrupt: 

## Translation

### Define y sequence of digits

X is same input but let's use Y as matrix with target of `'one' -> '1'`, `['twenty', 'three'] -> ['2','3']`, etc...

Use '!' for start of sequence and '#' for padding.



In [17]:
Y_vocab = {d:i for i,d in enumerate("0123456789#!")}
Y_vocab

{'0': 0,
 '1': 1,
 '2': 2,
 '3': 3,
 '4': 4,
 '5': 5,
 '6': 6,
 '7': 7,
 '8': 8,
 '9': 9,
 '#': 10,
 '!': 11}

In [38]:
Ystr = ['!'+str(i+1) for i in range(0,len(X))]
Y_max_len = get_max_len(Ystr)
Ystr[0:5]

['!1', '!2', '!3', '!4', '!5']

In [29]:
Y = []
for i in range(0,len(X)):
    y = Ystr[i]
    pad = Y_max_len - len(y)
    Y.append([Y_vocab['#']]*pad+[Y_vocab[d] for d in y])
Y = torch.tensor(Y)
X[0],Y[0]

(tensor([0, 0, 0, 0, 0, 1], device='cuda:0'), tensor([10, 10, 10, 11,  1]))

In [30]:
Y[0:5]

tensor([[10, 10, 10, 11,  1],
        [10, 10, 10, 11,  2],
        [10, 10, 10, 11,  3],
        [10, 10, 10, 11,  4],
        [10, 10, 10, 11,  5]])

In [31]:
nhidden = 128
nclasses = len(Y_vocab) # char output vocab

print(f"{n:,d} training records, batch size {batch_size}, {nclasses} target classes, state is {nhidden}-vector")

7,968 training records, batch size 32, 12 target classes, state is 128-vector


### Split out validation set

Not sure this will generalize but...

In [32]:
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

### Train

In [33]:
def forward(batch_X, X_max_len:int, batch_Y, Y_max_len):
    # ENCODER
    H = torch.zeros(nhidden, len(batch_X), device=device, dtype=torch.float64, requires_grad=False)
    for t in range(X_max_len):
        x_step_t = batch_X[:,t]
        # column E[i] is the embedding for char index i. same as multiple E.mm(onehot(i))
        embedding_step_t = XE[:,x_step_t]
        H = W.mm(H) + U.mm(embedding_step_t) + Bx
        H = torch.tanh(H)      
    C = H  # H is batch of context vectors for decoder

    # DECODER
    H = torch.zeros(nhidden, len(batch_Y), device=device, dtype=torch.float64, requires_grad=False)
    for t in range(Y_max_len):
        y_step_t = batch_Y[:,t]
        # column E[i] is the embedding for char index i. same as multiple E.mm(onehot(i))
        embedding_step_t = YE[:,y_step_t]
        H = W.mm(H) + U.mm(embedding_step_t) + By
        H = torch.tanh(H)

    o = V.mm(H) + Bo
    o = o.T # make it batch_size x nclasses
    return o

In [34]:
#%%time 
#torch.manual_seed(0) # SET SEED FOR TESTING
XE = torch.randn(embed_sz,     len(X_vocab),  device=device, dtype=torch.float64, requires_grad=True) # embedding
YE = torch.randn(embed_sz,     len(Y_vocab),  device=device, dtype=torch.float64, requires_grad=True) # embedding
W = torch.eye(nhidden,         nhidden,       device=device, dtype=torch.float64, requires_grad=True)
U = torch.randn(nhidden,       embed_sz,      device=device, dtype=torch.float64, requires_grad=True) # input converter
Bx = torch.zeros(nhidden,      batch_size,    device=device, dtype=torch.float64, requires_grad=True)
By = torch.zeros(nhidden,      batch_size,    device=device, dtype=torch.float64, requires_grad=True)
Bo = torch.zeros(nclasses,     batch_size,    device=device, dtype=torch.float64, requires_grad=True)
V = torch.randn(nclasses,      nhidden,       device=device, dtype=torch.float64, requires_grad=True) # take RNN output (h) and predict target

with torch.no_grad():
    XE[:,X_vocab['#']] = 0.0  # padding word gives 0 vector
    YE[:,Y_vocab['#']] = 0.0  # padding word gives 0 vector

optimizer = torch.optim.Adam([XE,YE,W,U,V,Bx,By,Bo], lr=0.001, weight_decay=0.0)
# scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, 
#                                               mode='triangular2',
#                                               step_size_up=4,
#                                               base_lr=0.001, max_lr=0.005,
#                                               cycle_momentum=False)

history = []
epochs = 70 # gets to 100% at 70 with lr=0.001
epochs = 55 # gets to 100% at 50 with cyclic base_lr=0.001, max_lr=0.005 every 4
for epoch in range(1, epochs+1):
#     print(f"EPOCH {epoch}")
    epoch_training_loss = 0.0
    epoch_training_accur = 0.0
    total = 0
    for p in range(0, n, batch_size):  # do one epoch
        loss = 0
        batch_X = X[p:p+batch_size]
        batch_Y = Y[p:p+batch_size]
        o = forward(batch_X, X_max_len, batch_Y, Y_max_len)
        correct = torch.argmax(softmax(o), dim=1)==batch_y
        epoch_training_accur += torch.sum(correct)

        loss = F.cross_entropy(o, batch_Y[1:])
#         print(loss.item())
        total += len(batch_y)

        # update matrices based upon loss computed from a batch
        optimizer.zero_grad()
        loss.backward() # autograd computes U.grad, M.grad, ...
        optimizer.step()

        epoch_training_loss += loss.detach().item()

    scheduler.step()
    epoch_training_loss /= nbatches
    epoch_training_accur /= n
    print(f"Epoch {epoch:3d} training loss {epoch_training_loss:7.4f} accur {epoch_training_accur:7.4f}   LR {scheduler.get_last_lr()[0]:7.6f}")

Epoch   1 training loss     nan accur  0.0000   LR 0.005000
Epoch   2 training loss     nan accur  0.0000   LR 0.004000


KeyboardInterrupt: 

In [36]:
 batch_Y[:,1:]

tensor([[7, 1, 0, 5],
        [7, 1, 0, 6],
        [7, 1, 0, 7],
        [7, 1, 0, 8],
        [7, 1, 0, 9],
        [7, 1, 1, 0],
        [7, 1, 1, 1],
        [7, 1, 1, 2],
        [7, 1, 1, 3],
        [7, 1, 1, 4],
        [7, 1, 1, 5],
        [7, 1, 1, 6],
        [7, 1, 1, 7],
        [7, 1, 1, 8],
        [7, 1, 1, 9],
        [7, 1, 2, 0],
        [7, 1, 2, 1],
        [7, 1, 2, 2],
        [7, 1, 2, 3],
        [7, 1, 2, 4],
        [7, 1, 2, 5],
        [7, 1, 2, 6],
        [7, 1, 2, 7],
        [7, 1, 2, 8],
        [7, 1, 2, 9],
        [7, 1, 3, 0],
        [7, 1, 3, 1],
        [7, 1, 3, 2],
        [7, 1, 3, 3],
        [7, 1, 3, 4],
        [7, 1, 3, 5],
        [7, 1, 3, 6]])