# RNN Encoder-decoder non-vectorized

use fastai human numbers data to train. First a classifier then do decoder.  The classifier is 1-to-1 so no possibility of generalizing. Just verifying my training loop and RNN. (Classifier is vectorized but not the encoder-decoder)

The data is from [fastai book chap 12](https://github.com/fastai/fastbook/blob/master/12_nlp_dive.ipynb). Looks like:

```
one 
two 
three 
...
two hundred seven 
two hundred eight 
...
```

In [1]:
from fastai2.text.all import *
path = untar_data(URLs.HUMAN_NUMBERS)
path

Path('/home/parrt/.fastai/data/human_numbers')

## Support

In [2]:
import codecs
import os
import re
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
#from torch.nn.functional import softmax
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

dtype = torch.float
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [3]:
def get_text(filename:str):
    """
    Load and return the text of a text file, assuming latin-1 encoding as that
    is what the BBC corpus uses.  Use codecs.open() function not open().
    """
    f = codecs.open(filename, encoding='latin-1', mode='r')
    s = f.read()
    f.close()
    return s

In [4]:
def getvocab(strings):
    letters = [list(l) for l in strings]
    vocab = set([c for cl in letters for c in cl])
    vocab = sorted(list(vocab))
    ctoi = {c:i for i, c in enumerate(vocab)}
    return vocab, ctoi

In [5]:
def softmax(y):
    expy = torch.exp(y)
    if len(y.shape)==1: # 1D case can't use axis arg
        return expy / torch.sum(expy)
    return expy / torch.sum(expy, axis=1).reshape(-1,1)

In [6]:
def get_max_len(X):
    max_len = 0
    for x in X:
        max_len = max(max_len, len(x))
    return max_len

## Load

In [7]:
text = get_text(path/'train.txt').strip()
print(text[:28])
lines = text.lower().split('\n')
print(lines[:5])

one 
two 
three 
four 
five 
['one ', 'two ', 'three ', 'four ', 'five ']


In [8]:
# get unique vocab but don't sort; keep order so 'one'=1 etc...
# use '#' to indicate padded (unused) char for embedding purposes
v = set('#')
X_vocab = ['#']
for t in text.split():
    if t not in v:
        X_vocab.append(t)
        v.add(t)
X_vocab[:10]

['#', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']

In [9]:
X_tokens = [line.strip().split(' ') for line in lines]
X_tokens[18:23]

[['nineteen'],
 ['twenty'],
 ['twenty', 'one'],
 ['twenty', 'two'],
 ['twenty', 'three']]

In [10]:
n = len(X_vocab)
X_vocab = {w:i for i,w in enumerate(X_vocab)}
X_vocab['one'], X_vocab['two']

(1, 2)

In [11]:
# numericalize but don't pad
X = []
for i in range(len(X_tokens)):
    x = X_tokens[i]
    X.append( [X_vocab[w] for w in x] )
X[15:30]

[[16],
 [17],
 [18],
 [19],
 [20],
 [20, 1],
 [20, 2],
 [20, 3],
 [20, 4],
 [20, 5],
 [20, 6],
 [20, 7],
 [20, 8],
 [20, 9],
 [21]]

## Translation

### Define y sequence of digits

Let's use Y as list of lists like X; targets like `'one' -> '1'`, `['twenty', 'three'] -> ['2','3']`, etc...

Use '<' for start of sequence and '>' for end. So sequence `ab` is stored `<ab>`.



In [12]:
Y_vocab = {d:i for i,d in enumerate("0123456789<>")}
Y_vocab

{'0': 0,
 '1': 1,
 '2': 2,
 '3': 3,
 '4': 4,
 '5': 5,
 '6': 6,
 '7': 7,
 '8': 8,
 '9': 9,
 '<': 10,
 '>': 11}

In [13]:
Ystr = [f"<{i+1}>" for i in range(0,len(X))]
Y_max_len = get_max_len(Ystr)
Ystr[:11]

['<1>', '<2>', '<3>', '<4>', '<5>', '<6>', '<7>', '<8>', '<9>', '<10>', '<11>']

In [14]:
Y = []
for i in range(0,len(X)):
    y = Ystr[i]
#    pad = Y_max_len - len(y)
    Y.append([Y_vocab[d] for d in y])
#Y = torch.tensor(Y)
Y[19:25]

[[10, 2, 0, 11],
 [10, 2, 1, 11],
 [10, 2, 2, 11],
 [10, 2, 3, 11],
 [10, 2, 4, 11],
 [10, 2, 5, 11]]

In [15]:
embed_sz = 20
y_embed_sz = 5
nhidden = 128
nclasses = len(Y_vocab) # char output vocab

print(f"{n:,d} training records, {nclasses} target classes, state is {nhidden}-vector")

30 training records, 12 target classes, state is 128-vector


### Split out validation set

Not sure this will generalize but...

In [16]:
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

### Train

In [17]:
def forward(batch_X, X_max_len:int, batch_Y, Y_max_len):
    # ENCODER
    H = torch.zeros(nhidden, len(batch_X), device=device, dtype=torch.float64, requires_grad=False)
    for t in range(X_max_len):
        x_step_t = batch_X[:,t]
        # column E[i] is the embedding for char index i. same as multiple E.mm(onehot(i))
        embedding_step_t = XE[:,x_step_t]
        H = W.mm(H) + U.mm(embedding_step_t) + Bx
        H = torch.tanh(H)      
    C = H  # H is batch of context vectors for decoder

    # DECODER
    H = torch.zeros(nhidden, len(batch_Y), device=device, dtype=torch.float64, requires_grad=False)
    for t in range(Y_max_len):
        y_step_t = batch_Y[:,t]
        # column E[i] is the embedding for char index i. same as multiple E.mm(onehot(i))
        embedding_step_t = YE[:,y_step_t]
        H = W.mm(H) + U.mm(embedding_step_t) + By
        H = torch.tanh(H)

    o = V.mm(H) + Bo
    o = o.T # make it batch_size x nclasses
    return o

In [39]:
#%%time 
#torch.manual_seed(0) # SET SEED FOR TESTING
Ex = torch.randn(embed_sz,     len(X_vocab),  device=device, dtype=torch.float64, requires_grad=True) # embedding
W = torch.eye(nhidden,         nhidden,       device=device, dtype=torch.float64, requires_grad=True)
U = torch.randn(nhidden,       embed_sz,      device=device, dtype=torch.float64, requires_grad=True) # input converter
bx = torch.zeros(nhidden,      1,             device=device, dtype=torch.float64, requires_grad=True)
by = torch.zeros(nhidden,      1,             device=device, dtype=torch.float64, requires_grad=True)
bo = torch.zeros(nclasses,     1,             device=device, dtype=torch.float64, requires_grad=True)

Ey = torch.randn(y_embed_sz,   len(Y_vocab),  device=device, dtype=torch.float64, requires_grad=True) # embedding
W2 = torch.eye(nhidden,        nhidden,       device=device, dtype=torch.float64, requires_grad=True)
#C = torch.randn(nhidden,       nhidden,       device=device, dtype=torch.float64, requires_grad=True) # input converter
U2 = torch.randn(nhidden,      y_embed_sz,    device=device, dtype=torch.float64, requires_grad=True) # input converter
V = torch.randn(nclasses,      nhidden,       device=device, dtype=torch.float64, requires_grad=True)

optimizer = torch.optim.Adam([Ex,W,U,Ey,W2,U2,V,bx,by,bo], lr=0.001, weight_decay=0.0)
scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, 
                                              mode='triangular2',
                                              step_size_up=4,
                                              base_lr=0.001, max_lr=0.005,
                                              cycle_momentum=False)

torch.autograd.set_detect_anomaly(True)

history = []
epochs = 70 # gets to 100% at 70 with lr=0.001
epochs = 55 # gets to 100% at 50 with cyclic base_lr=0.001, max_lr=0.005 every 4
for epoch in range(1, epochs+1):
    epoch_training_loss = 0.0
    epoch_training_accur = 0.0
    for i in range(n):
        x = X[i]
        y = Y[i]
        # ENCODER
        h = torch.zeros(nhidden, 1, device=device, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
        for t in range(len(x)):
            embedding_step_t = Ex[:,x[t]]
            embedding_step_t = embedding_step_t.reshape(embed_sz,1)
            h = W @ h + U @ embedding_step_t + bx
            h = torch.tanh(h)
#         c = h # final h is context for conditioned generator RNN
        
        # DECODER
        loss = 0.0
#         h2 = torch.zeros(nhidden, 1, device=device, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
        correct = 0
        for t in range(len(y)-1): # don't predict next char at final '>'
            embedding_step_t = Ey[:,y[t]]
            embedding_step_t = embedding_step_t.reshape(y_embed_sz,1)
            h = W2 @ h + U2 @ embedding_step_t + by
            h = torch.tanh(h)
            o = V @ h + bo
            o = o.reshape(1,nclasses)
            # From y we want to predict y[1:]. at y[t], predict y[t+1]
            loss += F.cross_entropy(o, torch.tensor([y[t+1]], device=device))

            p = softmax(o)
            epoch_training_accur += torch.argmax(p[0]).item()==y[t+1]

        # update matrices based upon loss
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()

        epoch_training_loss += loss.detach().item()

    scheduler.step()
    epoch_training_loss /= n
    epoch_training_accur /= n
    print(f"Epoch {epoch:3d} training loss {epoch_training_loss:7.4f} accur {epoch_training_accur:7.4f}   LR {scheduler.get_last_lr()[0]:7.6f}")

Epoch   1 training loss 28.5571 accur  0.6667   LR 0.002000
Epoch   2 training loss 10.9492 accur  1.3333   LR 0.003000
Epoch   3 training loss  2.4715 accur  2.2000   LR 0.004000
Epoch   4 training loss  1.5897 accur  2.2667   LR 0.005000
Epoch   5 training loss  3.9517 accur  2.2000   LR 0.004000
Epoch   6 training loss  5.5824 accur  1.8667   LR 0.003000
Epoch   7 training loss  1.7159 accur  2.4333   LR 0.002000
Epoch   8 training loss  0.4240 accur  2.6000   LR 0.001000
Epoch   9 training loss  0.0311 accur  2.7000   LR 0.001500
Epoch  10 training loss  0.0100 accur  2.7000   LR 0.002000
Epoch  11 training loss  0.0076 accur  2.7000   LR 0.002500
Epoch  12 training loss  0.0060 accur  2.7000   LR 0.003000
Epoch  13 training loss  0.0049 accur  2.7000   LR 0.002500
Epoch  14 training loss  0.0041 accur  2.7000   LR 0.002000
Epoch  15 training loss  0.0036 accur  2.7000   LR 0.001500
Epoch  16 training loss  0.0033 accur  2.7000   LR 0.001000
Epoch  17 training loss  0.0032 accur  2

KeyboardInterrupt: 

In [None]:
 batch_Y[:,1:]