# Human numbers refactored

In [1]:
from fastai2.text.all import *
# Fetch the HUMAN_NUMBERS dataset via untar_data (both names come from the star import above)
# and display the directory it resolves to.
path = untar_data(URLs.HUMAN_NUMBERS)
path

Path('/Users/parrt/.fastai/data/human_numbers')

## Support code

In [2]:
import pandas as pd
import numpy as np
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
np.set_printoptions(precision=2, suppress=True, linewidth=3000, threshold=20000)
from typing import Sequence
import re
import codecs

# NOTE(review): `dtype` is not referenced by any cell visible in this notebook; the hand-built
# layers below pass dtype=torch.float64 explicitly.
dtype = torch.float
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [3]:
def get_text(filename:str) -> str:
    """
    Load and return the text of a text file, assuming latin-1 encoding as that
    is what the BBC corpus uses.  Use codecs.open() function not open().

    :param filename: path of the file to read (a str or path-like object).
    :return: the entire decoded file content as one string.

    The file is opened in a `with` block so the handle is closed even when
    reading or decoding raises.
    """
    with codecs.open(filename, encoding='latin-1', mode='r') as f:
        return f.read()

In [4]:
def getvocab(strings):
    """
    Build the sorted vocabulary of distinct elements found in `strings`.

    :param strings: iterable of iterables (e.g. a list of str); every element
                    of every item contributes one vocabulary entry.
    :return: (vocab, ctoi) where vocab is the sorted list of distinct elements
             and ctoi maps each element to its position in vocab.
    """
    unique_chars = set()
    for s in strings:
        unique_chars.update(s)
    vocab = sorted(unique_chars)
    ctoi = dict(zip(vocab, range(len(vocab))))
    return vocab, ctoi

In [5]:
def get_max_len(X):
    """
    Return the length of the longest item in X, or 0 when X is empty.

    :param X: iterable of sized objects (strings, lists, ...).
    """
    return max((len(item) for item in X), default=0)

In [6]:
class Embedding:
    """
    Trainable lookup table stored as a matrix E of shape (embed_sz, input_size);
    column i is the embedding vector of index i.
    """
    def __init__(self, input_size, embed_sz):
        self.input_size = input_size
        self.embed_sz = embed_sz
        # Standard-normal initial values, created on the notebook-level `device`.
        self.E = torch.randn(embed_sz, input_size, device=device, dtype=torch.float64, requires_grad=True)
        # Optional down-scaling of the initial weights, currently disabled:
        #     with torch.no_grad():
        #         self.E *= 0.01

    def parameters(self):
        """Return the list of trainable tensors (just E)."""
        return [self.E]

    def __call__(self, x):
        """
        Look up the embedding column(s) for x.

        A plain Python int yields a column vector of shape (embed_sz, 1);
        any other index (list, tensor, ...) yields whatever E[:, x] produces.
        Selecting column i is equivalent to multiplying E by the one-hot vector of i.
        """
        columns = self.E[:, x]
        if isinstance(x, int):
            return columns.reshape(self.embed_sz, 1)
        return columns

In [7]:
class RNN:
    """
    Single vanilla recurrent cell: h' = tanh(W h + U x + bx).

    W is (nhidden, nhidden) and starts as the identity, U is (nhidden, input_sz)
    and starts standard-normal, bx is an (nhidden, 1) zero bias.
    """
    def __init__(self, input_sz, nhidden):
        tensor_opts = dict(device=device, dtype=torch.float64, requires_grad=True)
        self.W = torch.eye(nhidden, nhidden, **tensor_opts)
        self.U = torch.randn(nhidden, input_sz, **tensor_opts)
        self.bx = torch.zeros(nhidden, 1, **tensor_opts)
        # Optional down-scaling of the initial weights, currently disabled:
        #     with torch.no_grad():
        #         self.W *= 0.01
        #         self.U *= 0.01

    def parameters(self):
        """Return the trainable tensors in the order [W, U, bx]."""
        return [self.W, self.U, self.bx]

    def __call__(self, h, x):
        """Advance the state h by one step given input column(s) x; returns the new state."""
        return torch.tanh(self.W @ h + self.U @ x + self.bx)

In [8]:
class DecoderRNN(RNN):
    """
    RNN cell with an extra context input: h' = tanh(W h + C c + U x + bx).

    C is (nhidden, context_sz) and starts as an identity-patterned matrix
    (torch.eye); the remaining tensors are created by RNN.__init__.
    """
    def __init__(self, input_sz, context_sz, nhidden):
        super().__init__(input_sz, nhidden)
        self.C = torch.eye(nhidden, context_sz, device=device, dtype=torch.float64, requires_grad=True)

    def parameters(self):
        """Return the parent's trainable tensors followed by C."""
        return [*super().parameters(), self.C]

    def __call__(self, h, c, x):
        """Advance state h by one step given context c and input x; returns the new state."""
        pre_activation = self.W @ h + self.C @ c + self.U @ x + self.bx
        return torch.tanh(pre_activation)

In [9]:
class GRU:
    """
    Bias-free gated recurrent unit cell operating on column vectors.

    With z the update gate, r the reset gate and h_ the candidate state:

        z  = sigmoid(Whz h + Uxz x)
        r  = sigmoid(Whr h + Uxr x)
        h_ = tanh(Whh_ (r*h) + Uxh_ x)
        h' = (1-z)*h + z*h_

    The new state is the plain gated interpolation of the old state and the
    candidate. It is deliberately NOT passed through another tanh: the
    candidate is already squashed, and an extra tanh would shrink the state on
    every step even when the update gate is closed (z ~ 0), which defeats the
    gate's purpose of carrying state forward unchanged.

    The hidden-to-hidden matrices (nhidden, nhidden) start as the identity and
    the input matrices (nhidden, input_sz) start standard-normal.
    """
    def __init__(self, input_sz, nhidden):
        self.Whz  = torch.eye(nhidden,   nhidden,  device=device, dtype=torch.float64, requires_grad=True)
        self.Whr  = torch.eye(nhidden,   nhidden,  device=device, dtype=torch.float64, requires_grad=True)
        self.Whh_ = torch.eye(nhidden,   nhidden,  device=device, dtype=torch.float64, requires_grad=True)
        self.Uxh_ = torch.randn(nhidden, input_sz, device=device, dtype=torch.float64, requires_grad=True)
        self.Uxz  = torch.randn(nhidden, input_sz, device=device, dtype=torch.float64, requires_grad=True)
        self.Uxr  = torch.randn(nhidden, input_sz, device=device, dtype=torch.float64, requires_grad=True)

    def parameters(self):
        """Return the trainable tensors: [Whz, Whr, Whh_, Uxh_, Uxz, Uxr]."""
        return [self.Whz, self.Whr, self.Whh_, self.Uxh_, self.Uxz, self.Uxr]

    def __call__(self, h, x):
        """Advance state h by one step given input column(s) x; returns the new state."""
        z = torch.sigmoid(self.Whz@h + self.Uxz@x)      # update gate
        r = torch.sigmoid(self.Whr@h + self.Uxr@x)      # reset gate
        h_ = torch.tanh(self.Whh_@(r*h) + self.Uxh_@x)  # candidate state
        return (1-z)*h + z*h_

In [10]:
class Linear:
    """
    Affine output layer o = V h + by, returned transposed.

    V is (output_size, input_size) and starts standard-normal; by is an
    (output_size, 1) zero bias.
    """
    def __init__(self, input_size, output_size):
        self.V = torch.randn(output_size,  input_size, device=device, dtype=torch.float64, requires_grad=True)
        self.by = torch.zeros(output_size, 1,          device=device, dtype=torch.float64, requires_grad=True)
        # Optional down-scaling of the initial weights, currently disabled:
        #     with torch.no_grad():
        #         self.V *= 0.01

    def parameters(self):
        """Return the trainable tensors in the order [V, by]."""
        return [self.V, self.by]

    def __call__(self, h):
        """
        Map state column(s) h of shape (input_size, k) to scores.

        V@h + by is (output_size, k); the transpose puts one row per column
        of h, i.e. the result is (k, output_size).
        """
        scores = self.V @ h + self.by
        return scores.T

## Load and prepare

In [11]:
# Read the training split as one string; strip() only trims whitespace at the very
# start and end of the whole file.
text = get_text(path/'train.txt').strip()
print(text[:28])
# One entry per line, lower-cased. Interior lines keep their trailing space
# (visible in the printed sample).
lines = text.lower().split('\n')
print(lines[:5])

one 
two 
three 
four 
five 
['one ', 'two ', 'three ', 'four ', 'five ']


In [12]:
# Keep only the first 1,000 lines.
# NOTE(review): this rebinds `lines`, so the cell that created `lines` must be re-run
# to get the full list back.
lines = lines[0:1000]

In [13]:
# Build the input vocabulary in first-seen order (not sorted) so that 'one' gets
# index 1, 'two' index 2, and so on.
# Index 0 is reserved for '#', the padding (unused) symbol for embedding purposes.
X_vocab = ['#']
v = set(X_vocab)  # words already placed in X_vocab
for t in text.split():
    if t in v:
        continue
    v.add(t)
    X_vocab.append(t)
X_vocab[:10]

['#', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']

In [14]:
# Tokenize: trim each line, then split it on single spaces into its words.
X_tokens = [line.strip().split(' ') for line in lines]
X_tokens[18:23]

[['nineteen'],
 ['twenty'],
 ['twenty', 'one'],
 ['twenty', 'two'],
 ['twenty', 'three']]

In [15]:
n = len(X_tokens)  # number of records
X_vocab = list(X_vocab)  # fresh list with the same words in the same order
# Word -> index lookup; an index is the word's position in X_vocab.
X_ctoi = dict(zip(X_vocab, range(len(X_vocab))))
(X_ctoi['one'], X_ctoi['eleven'], X_vocab[1], X_vocab[11])

(1, 11, 'one', 'eleven')

In [16]:
# Numericalize: replace every word by its vocabulary index. Records keep their
# natural (variable) length; nothing is padded.
X = []
for i, x in enumerate(X_tokens):
    X.append([X_ctoi[w] for w in x])
X[15:30]

[[16],
 [17],
 [18],
 [19],
 [20],
 [20, 1],
 [20, 2],
 [20, 3],
 [20, 4],
 [20, 5],
 [20, 6],
 [20, 7],
 [20, 8],
 [20, 9],
 [21]]

In [17]:
# Show the whole input vocabulary in index order ('#' is index 0).
' '.join(X_vocab)

'# one two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen twenty thirty forty fifty sixty seventy eighty ninety hundred thousand'

### Define y sequence of digits

Let's make Y a list of lists like X, where each target is the sequence of digit characters of the number: `['one'] -> ['1']`, `['twenty', 'three'] -> ['2','3']`, etc.

Use '<' to mark the start of a sequence and '>' to mark the end, so the digit sequence `23` is stored as `<23>`.



In [18]:
# Output vocabulary: the ten digits plus '<' (start of sequence) and '>' (end of
# sequence). A symbol's position in the list is its class index.
Y_vocab = list("0123456789<>")
Y_ctoi = {d: i for i, d in enumerate(Y_vocab)}
Y_ctoi

{'0': 0,
 '1': 1,
 '2': 2,
 '3': 3,
 '4': 4,
 '5': 5,
 '6': 6,
 '7': 7,
 '8': 8,
 '9': 9,
 '<': 10,
 '>': 11}

In [19]:
# Target strings "<1>", "<2>", ... one per record.
# This assumes record k (1-based) spells out the number k.
Ystr = [f"<{k}>" for k in range(1, len(X) + 1)]
Y_max_len = get_max_len(Ystr)
Ystr[:11]

['<1>', '<2>', '<3>', '<4>', '<5>', '<6>', '<7>', '<8>', '<9>', '<10>', '<11>']

In [20]:
# Numericalize the targets: each character of "<digits>" becomes its class index.
Y = []
for i in range(len(X)):
    y = Ystr[i]
    Y.append(list(map(Y_ctoi.__getitem__, y)))
Y[19:25]

[[10, 2, 0, 11],
 [10, 2, 1, 11],
 [10, 2, 2, 11],
 [10, 2, 3, 11],
 [10, 2, 4, 11],
 [10, 2, 5, 11]]

In [21]:
# Model sizes: width of the input-word embedding, width of the output-symbol
# embedding, and length of the recurrent state vector.
embed_sz = 10
y_embed_sz = 5
nhidden = 256

n = len(X)  # number of training records
nclasses = len(Y_vocab) # char output vocab

print(f"{n:,d} training records, {embed_sz} input embedding size, {y_embed_sz} output embedding size, {nclasses} target classes, state is {nhidden}-vector")

1,000 training records, 10 input embedding size, 5 output embedding size, 12 target classes, state is 256-vector


In [22]:
class Transducer:
    """
    Encoder/decoder model that maps a sequence of input token indices to a
    sequence of output symbol indices, one record at a time.

    The encoder folds the embedded input tokens into a final state, which is
    used as a fixed context vector c. The decoder starts from a zero state and,
    at every step, consumes the previous output symbol together with c and
    emits scores over the output vocabulary through a Linear layer.

    :param input_sz: size of the input vocabulary.
    :param output_sz: size of the output vocabulary (number of classes).
    :param input_embed_sz: width of the input embedding.
    :param output_embed_sz: width of the output embedding.
    :param nhidden: length of the encoder/decoder state vectors.
    :param useGRU: when True use GRU cells instead of RNN/DecoderRNN. The GRU
                   decoder receives the symbol embedding stacked on top of the
                   context vector as its input (size output_embed_sz+nhidden).

    The state size and device are remembered on the instance (the device is
    the notebook-level `device` at construction time), so the model does not
    depend on globals that may change after it was built.
    """
    def __init__(self, input_sz, output_sz, input_embed_sz, output_embed_sz, nhidden, useGRU=False):
        self.nhidden = nhidden
        self.useGRU = useGRU
        self.device = device
        self.embx = Embedding(input_sz, input_embed_sz)
        self.emby = Embedding(output_sz, output_embed_sz)
        self.lin = Linear(nhidden, output_sz)
        if useGRU:
            self.encoder = GRU(input_embed_sz, nhidden)
            # Stored as `decoder` (not a separate attribute name) so parameters()
            # and the decode loops work for both cell types.
            self.decoder = GRU(output_embed_sz+nhidden, nhidden)
        else:
            self.encoder = RNN(input_embed_sz, nhidden)
            self.decoder = DecoderRNN(output_embed_sz, nhidden, nhidden)

    def parameters(self):
        """Return all trainable tensors: embx, emby, lin, encoder, decoder (in that order)."""
        params = []
        for layer in (self.embx, self.emby, self.lin, self.encoder, self.decoder):
            params += layer.parameters()
        return params

    def _initial_state(self):
        """Fresh all-zero (nhidden, 1) state; used at the start of every record."""
        return torch.zeros(self.nhidden, 1, device=self.device, dtype=torch.float64, requires_grad=False)

    def _encode(self, x):
        """Run the encoder over the token indices in x and return its final state."""
        h = self._initial_state()
        for token in x:
            h = self.encoder(h, self.embx(token))
        return h

    def _decode_step(self, h, c, symbol):
        """Advance the decoder one step from `symbol`; returns (new state, output scores)."""
        e = self.emby(symbol)
        if self.useGRU:
            # GRU cells take a single input, so stack embedding and context.
            h = self.decoder(h, torch.cat([e, c], dim=0))
        else:
            h = self.decoder(h, c, e)
        return h, self.lin(h)

    def __call__(self, x, y):
        """
        Teacher-forced pass over one record.

        :param x: sequence of input token indices.
        :param y: target symbol indices including the leading '<' and trailing '>'.
        :return: (output, loss, correct) where output is the list of predicted
                 indices for y[1:], loss is the summed cross-entropy over those
                 len(y)-1 predictions, and correct counts the right predictions
                 excluding the final end-of-sequence step.
        """
        c = self._encode(x)

        output = []
        loss = 0.0
        correct = 0
        h = self._initial_state()
        final_step = len(y) - 2  # the step whose target is the closing '>'
        for t in range(len(y)-1): # don't predict next char at final '>'
            h, o = self._decode_step(h, c, y[t])
            target = y[t+1]
            # From y we want to predict y[1:]: at y[t], predict y[t+1] using c as context.
            loss += F.cross_entropy(o, torch.tensor([target], device=self.device), reduction="sum")

            p = F.softmax(o[0], dim=0)
            y_pred = torch.argmax(p).item()
            # The final '>' is excluded from the metric. Skipping that step here
            # (instead of subtracting one afterwards) keeps the count right even
            # when the '>' itself was mispredicted.
            if t < final_step and y_pred == target:
                correct += 1
            output.append(y_pred)

        return output, loss, correct

    def predict(self, x, Y_ctoi):
        """
        Greedy decoding of one record.

        :param x: sequence of input token indices.
        :param Y_ctoi: output symbol -> index map; must contain '<' and '>'.
        :return: list of predicted indices starting with '<'; decoding stops
                 after '>' is produced or once more than MAX_LENGTH symbols
                 have been collected.
        """
        c = self._encode(x)

        start, stop = Y_ctoi['<'], Y_ctoi['>']
        y_pred = start # begin with "start of sequence" char
        output = [y_pred]
        h = self._initial_state()
        MAX_LENGTH = 20 # for safety
        while y_pred != stop and len(output) <= MAX_LENGTH:
            h, o = self._decode_step(h, c, y_pred)
            p = F.softmax(o[0], dim=0)
            y_pred = torch.argmax(p).item()
            output.append(y_pred)
        return output

In [23]:
def xstr(x):
    """Return the X_vocab words for the indices in x, concatenated with no separator."""
    words = (X_vocab[idx] for idx in x)
    return ''.join(words)
def ystr(y):
    """Return the Y_vocab symbols for the indices in y, concatenated into one string."""
    symbols = (Y_vocab[idx] for idx in y)
    return ''.join(symbols)

In [24]:
# Build the plain-RNN variant of the model and train it one record at a time.
trans = Transducer(input_sz=len(X_ctoi),
                   output_sz=len(Y_ctoi),
                   input_embed_sz=embed_sz,
                   output_embed_sz=y_embed_sz,
                   nhidden=nhidden,
                   useGRU=False)
parameters = trans.parameters()
# NOTE(review): the lr=0.0005 given to Adam is presumably superseded by the scheduler's
# base_lr -- the first LR logged below is 0.000300. Confirm against the CyclicLR docs.
optimizer = torch.optim.Adam(parameters, lr=0.0005, weight_decay=0.0)
# The scheduler is stepped once per epoch (see the end of the epoch loop), so
# step_size_up=4 is measured in epochs here, not in batches.
scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, 
                                              mode='triangular2',
                                              step_size_up=4,
                                              base_lr=0.0003, max_lr=0.001,
                                              cycle_momentum=False)
# NOTE(review): `history` is never appended to or read in this cell.
history = []
epochs = 10
for epoch in range(1, epochs+1):
    # Per-epoch accumulators; both are divided by total_compares after the record loop.
    epoch_training_loss = 0.0
    epoch_training_accur = 0.0
    total_compares = 0
    for i in range(n):
        x = X[i]
        y = Y[i]
        # Teacher-forced pass: predictions, summed loss and number of correct symbols.
        y_pred, loss, correct = trans(x, y)
        # NOTE(review): `tostr` in the disabled debug print below is not defined in the
        # visible cells; the helpers defined above are xstr/ystr.
#         if epoch==10:
#             print(f"{tostr(x)}->{tostr(y)}: {tostr(y_pred)}, {correct} correct")
        epoch_training_accur += correct
        epoch_training_loss += loss.detach().item()
        total_compares += len(y) - 2  # From "<foo>" predict "foo>" but don't count last '>' for metrics

        # One optimizer update per record (batch size 1).
        optimizer.zero_grad()
        loss.backward() # autograd fills .grad on every tensor in `parameters`
        optimizer.step()
        
        # Disabled truncated-backprop variant kept for reference:
#         if t % bptt == 0 and t > 0:
#             optimizer.zero_grad()
#             loss.backward() # autograd computes U.grad, M.grad, ...
#             optimizer.step()
#             epoch_training_loss += loss.detach().item()
#             loss = 0
#             H = H.detach() # no longer consider previous computations

    epoch_training_accur /= total_compares
    # NOTE(review): each record's loss sums len(y)-1 terms (the final '>' included) but is
    # normalised here by a count of len(y)-2 per record, so this is not a per-prediction mean.
    epoch_training_loss /= total_compares
    
#     print(f"Epoch {epoch:3d} training loss {epoch_training_loss:8.3f}   accur {epoch_training_accur:7.4f}")
    print(f"Epoch {epoch:3d} training loss {epoch_training_loss:8.3f}   accur {epoch_training_accur:7.4f}   LR {scheduler.get_last_lr()[0]:7.6f}")
    scheduler.step()    

Epoch   1 training loss    6.782   accur  0.4072   LR 0.000300
Epoch   2 training loss    2.247   accur  0.6654   LR 0.000475
Epoch   3 training loss    1.727   accur  0.7359   LR 0.000650
Epoch   4 training loss    1.378   accur  0.7916   LR 0.000825
Epoch   5 training loss    1.335   accur  0.8144   LR 0.001000
Epoch   6 training loss    0.765   accur  0.8828   LR 0.000825
Epoch   7 training loss    0.495   accur  0.9219   LR 0.000650
Epoch   8 training loss    0.243   accur  0.9492   LR 0.000475
Epoch   9 training loss    0.063   accur  0.9813   LR 0.000300
Epoch  10 training loss    0.018   accur  0.9945   LR 0.000387


In [25]:
# List the shape of every trainable tensor, in the order Transducer.parameters() returns them.
parameters = trans.parameters()
print(len(parameters), "parameters")
[p.shape for p in parameters]

11 parameters


[torch.Size([10, 30]),
 torch.Size([5, 12]),
 torch.Size([12, 256]),
 torch.Size([12, 1]),
 torch.Size([256, 256]),
 torch.Size([256, 10]),
 torch.Size([256, 1]),
 torch.Size([256, 256]),
 torch.Size([256, 5]),
 torch.Size([256, 1]),
 torch.Size([256, 256])]

In [26]:
# Greedy-decode every record and count exact matches of the whole "<digits>" string.
# NOTE(review): this loops over the same X/Y that were used for training, so despite the
# name `valid_accur` no held-out data is involved.
with torch.no_grad():
    # NOTE(review): valid_accur and total_compares are set/accumulated here but never read
    # afterwards in this cell.
    valid_accur = 0
    total_compares = 0
    total_correct = 0
    for i in range(n):
        x = X[i]
        y = Y[i]
#         y_pred = predict(trans, x, Y_ctoi)
        y_pred = trans.predict(x, Y_ctoi)
        total_compares += len(y) - 2 # From "<foo>" predict "foo>" but don't count last '>' for metrics
        # Whole-sequence comparison: True only when every symbol matches.
        correct = ystr(y)==ystr(y_pred)
        total_correct += correct
        if not correct:
            # Inside this branch `not correct` is always true, so ' MISMATCH' is always appended.
            print(f"{xstr(x)}->{ystr(y)}: {ystr(y_pred)}{' MISMATCH' if not correct else ''}")
print(f"{total_correct}/{n} correct")

seventytwo-><72>: <722> MISMATCH
ninetytwo-><92>: <922> MISMATCH
onehundredfive-><105>: <100> MISMATCH
onehundredsixtytwo-><162>: <1622> MISMATCH
sixhundredeight-><608>: <638> MISMATCH
sevenhundredtwentytwo-><722>: <7222> MISMATCH
994/1000 correct


In [27]:
# Spot check: numericalize a phrase, greedy-decode it, and print the words next to the result.
# NOTE(review): this three-line pattern is repeated in the following cells with only the
# phrase changed; a small helper taking the phrase would remove the duplication.
# The comprehension variable `n` is local to the comprehension, so the global record
# count `n` is not overwritten.
x = [X_ctoi[w] for w in "one".split()]
output = trans.predict(x, Y_ctoi)
print([X_vocab[n] for n in x],'=>', ystr(output))

['one'] => <1>


In [28]:
# Spot check on a two-word phrase: numericalize, greedy-decode, print.
x = [X_ctoi[w] for w in "one hundred".split()]
output = trans.predict(x, Y_ctoi)
print([X_vocab[n] for n in x],'=>', ystr(output))

['one', 'hundred'] => <100>


In [29]:
# Spot check on a three-word phrase: numericalize, greedy-decode, print.
x = [X_ctoi[w] for w in "one hundred ten".split()]
output = trans.predict(x, Y_ctoi)
print([X_vocab[n] for n in x],'=>', ystr(output))

['one', 'hundred', 'ten'] => <110>


In [30]:
# Spot check on a four-word phrase: numericalize, greedy-decode, print.
x = [X_ctoi[w] for w in "one hundred twenty two".split()]
output = trans.predict(x, Y_ctoi)
print([X_vocab[n] for n in x],'=>', ystr(output))

['one', 'hundred', 'twenty', 'two'] => <122>


In [31]:
# Spot check on a single word whose target has two digits: numericalize, greedy-decode, print.
x = [X_ctoi[w] for w in "eleven".split()]
output = trans.predict(x, Y_ctoi)
print([X_vocab[n] for n in x],'=>', ystr(output))

['eleven'] => <11>


In [32]:
# Spot check on "ninety nine": numericalize, greedy-decode, print.
x = [X_ctoi[w] for w in "ninety nine".split()]
output = trans.predict(x, Y_ctoi)
print([X_vocab[n] for n in x],'=>', ystr(output))

['ninety', 'nine'] => <99>


In [33]:
# Spot check on "fifty three": numericalize, greedy-decode, print.
x = [X_ctoi[w] for w in "fifty three".split()]
output = trans.predict(x, Y_ctoi)
print([X_vocab[n] for n in x],'=>', ystr(output))

['fifty', 'three'] => <53>
