# Translation with attention

Let's do French -> English. French has multiple phrases that map to single English phrase so can't do English->French as well. E.g.,

```
Get ready.      Prépare-toi.
Get ready.      Préparez-vous.
```

Attention with GRU is better than context gru.  Attn with RNN is still very good and better than context-GRU.

## Support code

In [38]:
import pandas as pd
import numpy as np
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
np.set_printoptions(precision=2, suppress=True, linewidth=3000, threshold=20000)
from typing import Sequence
import editdistance # Get Levenshtein (pip install editdistance)
import re

dtype = torch.float
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [39]:
def getvocab(strings):
    letters = [list(l) for l in strings]
    vocab = set([c for cl in letters for c in cl])
    vocab = sorted(list(vocab))
    ctoi = {c:i for i, c in enumerate(vocab)}
    return vocab, ctoi

In [40]:
def get_max_len(X):
    max_len = 0
    for x in X:
        max_len = max(max_len, len(x))
    return max_len

## Load and prepare

In [41]:
class Embedding:
    def __init__(self, input_size, embed_sz):
        self.E = torch.randn(embed_sz, input_size, device=device, dtype=torch.float64, requires_grad=True) # embedding
        self.input_size = input_size
        self.embed_sz = embed_sz
#         with torch.no_grad():
#             self.E *= 0.01
    def parameters(self): return [self.E]
    def __call__(self, x):
        if isinstance(x, int):
            return self.E[:,x].reshape(self.embed_sz, 1)
        # column E[i] is the embedding for char index i. same as multiple E.mm(onehot(i))
        return self.E[:,x]

In [42]:
class RNN:
    def __init__(self, input_sz, nhidden):
        self.W = torch.eye(nhidden,    nhidden,  device=device, dtype=torch.float64, requires_grad=True)
        self.U = torch.randn(nhidden,  input_sz, device=device, dtype=torch.float64, requires_grad=True)
        self.bx = torch.zeros(nhidden, 1,        device=device, dtype=torch.float64, requires_grad=True)
#         with torch.no_grad():
#             self.W *= 0.01
#             self.U *= 0.01
    def parameters(self): return [self.W, self.U, self.bx]
    def __call__(self, h, x):
        h = self.W@h + self.U@x + self.bx
        h = torch.tanh(h)
        return h

In [43]:
class DecoderRNN(RNN):
    def __init__(self, input_sz, context_sz, nhidden):
        super().__init__(input_sz, nhidden)
        self.C = torch.eye(nhidden,    context_sz, device=device, dtype=torch.float64, requires_grad=True)
    def parameters(self): return super().parameters()+[self.C]
    def __call__(self, h, c, x):
        h = self.W@h + self.C@c + self.U@x + self.bx
        h = torch.tanh(h)
        return h    

In [44]:
class GRU:
    def __init__(self, input_sz, nhidden, include_bias=False):
        self.Whz  = torch.eye(nhidden,   nhidden,  device=device, dtype=torch.float64, requires_grad=True)
        self.Whr  = torch.eye(nhidden,   nhidden,  device=device, dtype=torch.float64, requires_grad=True)
        self.Whh_ = torch.eye(nhidden,   nhidden,  device=device, dtype=torch.float64, requires_grad=True)
        self.Uxh_ = torch.randn(nhidden, input_sz, device=device, dtype=torch.float64, requires_grad=True)
        self.Uxz  = torch.randn(nhidden, input_sz, device=device, dtype=torch.float64, requires_grad=True)
        self.Uxr  = torch.randn(nhidden, input_sz, device=device, dtype=torch.float64, requires_grad=True)
        # if include_bias these stay 0
        self.bz   = torch.zeros(nhidden, 1,        device=device, dtype=torch.float64, requires_grad=True)
        self.br   = torch.zeros(nhidden, 1,        device=device, dtype=torch.float64, requires_grad=True)
        self.bh_  = torch.zeros(nhidden, 1,        device=device, dtype=torch.float64, requires_grad=True)
        self.include_bias = include_bias
    def parameters(self):
        p = [self.Whz, self.Whr, self.Whh_, self.Uxh_, self.Uxz, self.Uxr]
        if self.include_bias:
            p += [self.bz, self.br, self.bh_]    
        return p
    def __call__(self, h, x):
        z = torch.sigmoid(self.Whz@h    + self.Uxz@x  + self.bz)
        r = torch.sigmoid(self.Whr@h    + self.Uxr@x  + self.br)
        h_ = torch.tanh(self.Whh_@(r*h) + self.Uxh_@x + self.bh_)
#         print(h.shape, z.shape, r.shape, h_.shape)
        h = torch.tanh( (1-z)*h + z*h_ )
        return h

In [45]:
class DecoderGRU(GRU):
    def __init__(self, input_sz, context_sz, nhidden, include_bias=False):
        super().__init__(input_sz, nhidden, include_bias)
        self.C = torch.eye(nhidden,    context_sz, device=device, dtype=torch.float64, requires_grad=True)
    def parameters(self): return super().parameters()+[self.C]
    def __call__(self, h, c, x):
        z = torch.sigmoid(self.Whz@h    + self.C@c + self.Uxz@x  + self.bz)
        r = torch.sigmoid(self.Whr@h    + self.C@c + self.Uxr@x  + self.br)
        h_ = torch.tanh(self.Whh_@(r*h) + self.C@c + self.Uxh_@x + self.bh_)
        h = torch.tanh( (1-z)*h + z*h_ )
        return h

In [46]:
class Linear:
    def __init__(self, input_size, output_size):
        self.V = torch.randn(output_size,  input_size, device=device, dtype=torch.float64, requires_grad=True)
        self.by = torch.zeros(output_size, 1,          device=device, dtype=torch.float64, requires_grad=True)
#         with torch.no_grad():
#             self.V *= 0.01
    def parameters(self): return [self.V, self.by]
    def __call__(self, h):
        o = self.V@h + self.by
        o = o.T # make it input_size x output_size
        return o

In [47]:
class Dropout:
    def __init__(self, p=0.0, fixed=False):
        """
        If fixed, reuse same mask for all future uses of this layer.
        Assumes v columns are the layer activations. If batch size is 1, then this will be a column vector.
        Same column knockout used future invocations if fixed.
        """
        self.p = p
        self.fixed = fixed
        self.mask = None
    def __call__(self, v):
        """
        Column(s) are activation vectors. Get a new column mask and knockout elements with
        it for each column (unless fixed).
        """
        if isinstance(v, list):
            v = torch.tensor(v, device=device)

        if self.fixed:
            if self.mask is None:
                usample = torch.empty_like(v, device=device).uniform_(0, 1)   # get random value for each activation matrix element
                mask = self.mask = (usample>self.p).int()      # get boolean mask as "those with value greater than p"
            else:
                mask = self.mask
        else:
            usample = torch.empty_like(v, device=device).uniform_(0, 1)     # get random value for each activation matrix element
            mask = (usample>self.p).int()                    # get boolean mask as "those with value greater than p"

        v_ = v * mask                                    # kill masked activations
        v_ /= 1 - self.p                                 # scale during training by 1/(1-p) to avoid scaling by p at test time
                                                         # after dropping p activations, (1-p) are left untouched, on average
#             print(v,"becomes",v_)
        return v_

In [48]:
with open("data/eng-fra.txt") as f:
    text = f.read().strip().lower()

# clean up, normalize
text = re.sub(r"[ \u202f\u209f\u20bf\u2009\u3000\xa0]+", " ", text)  # there are lots of space chars in unicode
text = re.sub(r"\u200b|\xad|‐|–", "-", text)  # there are lots of space chars in unicode
text = re.sub(r"‘|’", "'", text)  # there are lots of space chars in unicode
text = text.replace("‽", "?")
text = text.replace("…", "")
text = text.replace("₂", "")
# text = text.replace("\u202f", " ")
# text = text.replace("\u209f", " ")
# text = text.replace("\u20bf", " ")
text = text.replace(" !", "")
text = text.replace(" .", "")
text = re.sub(r"([.!?])", "", text)
lines = text.split("\n")

In [49]:
lines = [line for line in lines if not len(set(line).intersection({'(',')','~','€','$','%','&','/','«','»'}))]
pairs = [line.split('\t') for line in lines]

In [50]:
MAX_LENGTH = 15
pairs = [p for p in pairs if len(p[0])<=MAX_LENGTH and len(p[1])<=MAX_LENGTH]

In [51]:
FILTER = False
if FILTER:
    eng_prefixes = (
        "i am ", "i'm ",
        "he is ", "he's ",
        "she is ", "she's ",
        "you are ", "you're ",
        "we are ", "we're ",
        "they are ", "they're "
        )
    filtered_pairs = []
    for p in pairs:
        en,fr = p
        for pre in eng_prefixes:
            if en.startswith(pre):
                filtered_pairs.append(p)
                break

    pairs = filtered_pairs            

In [52]:
pairs = pairs[0:1000] # testing

In [53]:
pairs = [(p[1],p[0]) for p in pairs]

In [54]:
len(pairs)

1000

In [55]:
# Remove duplicates
pairs = list(dict(pairs).items())
len(pairs)

867

In [56]:
vocab = sorted(set('\n'.join(lines)))
vocab = vocab[2:] # drop \t and \n
vocab = ['<','>']+vocab # add delimiters as 0, 1
ctoi = {c:i for i, c in enumerate(vocab)}

In [57]:
len(vocab)

64

In [58]:
''.join(vocab)

'<> "\'+,-0123456789:;abcdefghijklmnopqrstuvwxyzàâçèéêëîïòôöùúûœас'

In [59]:
pairs[0:10]

[('va', 'go'),
 ('cours', 'run'),
 ('courez', 'run'),
 ('ça alors', 'wow'),
 ('au feu', 'fire'),
 ("à l'aide", 'help'),
 ('saute', 'jump'),
 ('ça suffit', 'stop'),
 ('stop', 'stop'),
 ('arrête-toi', 'stop')]

## Wrap in <...> and Numericalize

In [60]:
pairs = [(f"<{p[0]}>",f"<{p[1]}>") for p in pairs]

In [61]:
pairs[0:5]

[('<va>', '<go>'),
 ('<cours>', '<run>'),
 ('<courez>', '<run>'),
 ('<ça alors>', '<wow>'),
 ('<au feu>', '<fire>')]

In [62]:
X = []
Y = []
for p in pairs:
    fr, en = p
    X.append([ctoi[c] for c in fr])
    Y.append([ctoi[c] for c in en])

In [63]:
X[0:5]

[[0, 41, 20, 1],
 [0, 22, 34, 40, 37, 38, 1],
 [0, 22, 34, 40, 37, 24, 45, 1],
 [0, 48, 20, 2, 20, 31, 34, 37, 38, 1],
 [0, 20, 40, 2, 25, 24, 40, 1]]

In [64]:
Y[0:5]

[[0, 26, 34, 1],
 [0, 37, 40, 33, 1],
 [0, 37, 40, 33, 1],
 [0, 42, 34, 42, 1],
 [0, 25, 28, 37, 24, 1]]

## Split out validation set

In [65]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [66]:
n = len(X_train)
char_embed_sz = 10
nhidden = 300
nclasses = len(vocab) # char output vocab

print(f"{n:,d} training records, {char_embed_sz} embedding size, {nclasses} target classes, state is {nhidden}-vector")

693 training records, 10 embedding size, 64 target classes, state is 300-vector


In [67]:
def tostr(x):
    return ''.join([vocab[v] for v in x])

In [68]:
class Transducer:
    def __init__(self, input_sz, output_sz, input_embed_sz, output_embed_sz, nhidden, 
                 dropout=0.0,
                 method="dot",
                 useGRU=False):
        self.dropout = dropout
        self.method = method
        self.embx  = Embedding(input_sz, input_embed_sz)
        self.emby  = Embedding(output_sz, output_embed_sz)
#         self.postgru = Linear(nhidden, nhidden)
        self.Wattn   = Linear(nhidden, nhidden) # map hidden decoder s to new space
#         self.out   = Linear(nhidden, output_sz)
        self.out   = Linear(nhidden, output_sz)
        self.Wcombine = torch.eye(nhidden, nhidden*2, device=device, dtype=torch.float64, requires_grad=True)
        if useGRU:
            self.encoder = GRU(input_embed_sz, nhidden)
            self.decoder = GRU(output_embed_sz, nhidden)
        else:
            self.encoder = RNN(input_embed_sz, nhidden)
            self.decoder = RNN(output_embed_sz, nhidden)
        
    def parameters(self):
        return self.embx.parameters()+\
               self.emby.parameters()+\
               self.out.parameters()+\
               self.encoder.parameters()+\
               [self.Wcombine]+\
               self.Wattn.parameters()+\
               self.decoder.parameters()
#                self.postgru.parameters()+\

    def __call__(self, x, y):
        x_dropout = Dropout(p=self.dropout, fixed=True)
        y_dropout = Dropout(p=self.dropout, fixed=True)
        z_dropout = Dropout(p=self.dropout, fixed=True)
        
        if isinstance(x, list):
            x = torch.tensor(x)
        if isinstance(y, list):
            y = torch.tensor(y)
            
        assert x.dim()==1 or x.dim()==2
        assert y.dim()==1 or y.dim()==2
        
        if x.dim()==1:
            batch_size = 1
            x = x.reshape(1,-1)
        else:
            batch_size = x.shape[0]
        if y.dim()==1:
            y = y.reshape(1,-1)
            
        m = x.shape[1] # num input symbols
        
        # ENCODER
        encoder_states = []
        h = torch.zeros(nhidden, batch_size, device=device, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
        for t in range(m):
            embedding_step_t = self.embx(x[:,t])
            embedding_step_t = x_dropout(embedding_step_t)
            h = self.encoder(h, embedding_step_t)
            encoder_states.append(h)
        encoder_states = torch.cat(encoder_states, dim=1) # columns are h vectors
#         print(encoder_states.shape)
        
        c = h

        # DECODER
        output = []
        loss = 0.0
        correct = 0
        # h is for encoder, s is for decoder
        s = torch.zeros(nhidden, batch_size, device=device, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
        for t in range(y.shape[1]-1): # don't predict next char at final '>'
            embedding_step_t = self.emby(y[:,t])
            embedding_step_t = y_dropout(embedding_step_t)
            s = self.decoder(s, embedding_step_t)
            c = self.context(s, encoder_states)
#             print(self.Wcombine.shape, s.shape, c.shape)
            s_ = F.relu( self.Wcombine @ torch.cat([s,c], dim=0) )
            o = self.out(s_)

#             print(embedding_step_t.shape, o.shape, torch.tensor([y[t+1]], device=device).shape)
            o = z_dropout(o)
            # From y we want to predict y[1:]. at y[t], predict y[t+1] using c as context vector
            y_true = torch.tensor(y[:,t+1], device=device).reshape(batch_size)
            loss += F.cross_entropy(o, y_true, reduction="sum")
            p = F.softmax(o, dim=1)
            y_pred = torch.argmax(p, dim=1)#.item()
            correct += torch.sum(y_pred==y[:,t+1])
            output.append(y_pred)
        return output, loss, int(correct)

    def predict(self, x, Y_ctoi):
        if isinstance(x, list):
            x = torch.tensor(x)

        assert x.dim()==1 or x.dim()==2
        
        if x.dim()==1:
            batch_size = 1
            x = x.reshape(1,-1)
        else:
            batch_size = x.shape[0]
            
        # ENCODER
        encoder_states = []
        h = torch.zeros(nhidden, batch_size, device=device, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
        for t in range(x.shape[1]):
            embedding_step_t = self.embx(x[:,t])
            h = self.encoder(h, embedding_step_t)
            encoder_states.append(h)
        encoder_states = torch.cat(encoder_states, dim=1) # columns are h vectors
        c = h

        # DECODER
        output = []
        loss = 0.0
        output = []
        y_pred = Y_ctoi['<'] # begin with "start of sequence" char
        output.append(y_pred)
        s = torch.zeros(nhidden, batch_size, device=device, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
        MAX = 20 # for safety
        while y_pred!=Y_ctoi['>'] and len(output)<=MAX:
            embedding_step_t = self.emby(y_pred)
            s = self.decoder(s, embedding_step_t)
            c = self.context(s, encoder_states)
            s_ = F.relu( self.Wcombine @ torch.cat([s,c], dim=0) )
            o = self.out(s_)

#             print(embedding_step_t.shape, o.shape, torch.tensor([y[t+1]], device=device).shape)
            p = F.softmax(o, dim=1)
            y_pred = torch.argmax(p, dim=1).item()
            output.append(y_pred)
        return output

    def context(self, s, encoder_states):
        m = encoder_states.shape[1] # how many input symbols        
        encoder_attn = torch.empty(size=(m,), dtype=torch.float64, requires_grad=False)
        if self.method=="general":
            s = self.Wattn(s)
            s = s.T # o is column vector of length nhidden
        for i in range(m): # go through hidden encoder vectors for each input i
            if self.method=="dot":
                # Wow. that math.sqrt(nhidden) seems to help, at least for RNN
                encoder_attn[i] = torch.dot(s.flatten(), encoder_states[:,i]).item()
            elif self.method=="dotscaled":
                # Wow. that math.sqrt(nhidden) seems to help, at least for RNN
                encoder_attn[i] = torch.dot(s.flatten(), encoder_states[:,i]).item() / math.sqrt(nhidden)
            elif self.method=="general":
                encoder_attn[i] = torch.dot(s.flatten(), encoder_states[:,i]).item()
        encoder_attn = F.softmax(encoder_attn, dim=0)
#             print("softmax", encoder_attn)

        # context vector is weighted average of encoder_states
#             print(encoder_states.shape, encoder_attn.T.shape, encoder_attn)
        c = encoder_states @ encoder_attn.reshape(-1,1)
        return c

In [85]:
trans = Transducer(input_sz=len(ctoi),
                   output_sz=len(ctoi),
                   input_embed_sz=char_embed_sz,
                   output_embed_sz=char_embed_sz,
                   nhidden=nhidden,
                   dropout=0.0,
                   method="dotscaled",
                   useGRU=False)
optimizer = torch.optim.Adam(trans.parameters(), lr=0.0005, weight_decay=0.0)
scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, 
                                              mode='triangular2',
                                              step_size_up=4,
#                                               base_lr=0.000001, max_lr=0.0005, # RNN
                                              base_lr=0.00001, max_lr=0.0005, # GRU
                                              cycle_momentum=False)

history = []
epochs = 25
for epoch in range(1, epochs+1):
    epoch_training_loss = 0.0
    epoch_training_accur = 0.0
    total_compares = 0
    for i in torch.randperm(n):
        x = X_train[i]
        y = Y_train[i]
        y_pred, loss, correct = trans(x, y)
#         if epoch==10:
#             print(f"{tostr(x)}->{tostr(y)}: {tostr(y_pred)}, {correct} correct")
        epoch_training_accur += correct
        epoch_training_loss += loss.detach().item()
        total_compares += len(y) - 1  # From "<foo>" predict and count "foo>"

        optimizer.zero_grad()
        loss.backward() # autograd computes U.grad, M.grad, ...
        optimizer.step()
        
    epoch_training_accur /= total_compares
    epoch_training_loss /= total_compares
    
    print(f"Epoch {epoch:3d} training loss {epoch_training_loss:8.3f}   accur {epoch_training_accur:7.4f}   LR {scheduler.get_last_lr()[0]:7.6f}")
    scheduler.step()    

  y_true = torch.tensor(y[:,t+1], device=device).reshape(batch_size)


Epoch   1 training loss   12.184   accur  0.0971   LR 0.000010
Epoch   2 training loss    2.942   accur  0.3472   LR 0.000132
Epoch   3 training loss    2.419   accur  0.4232   LR 0.000255
Epoch   4 training loss    2.327   accur  0.4541   LR 0.000378
Epoch   5 training loss    2.137   accur  0.4742   LR 0.000500
Epoch   6 training loss    1.480   accur  0.5997   LR 0.000378
Epoch   7 training loss    1.063   accur  0.6853   LR 0.000255
Epoch   8 training loss    0.734   accur  0.7786   LR 0.000132
Epoch   9 training loss    0.541   accur  0.8423   LR 0.000010
Epoch  10 training loss    0.562   accur  0.8362   LR 0.000071
Epoch  11 training loss    0.630   accur  0.8062   LR 0.000132
Epoch  12 training loss    0.733   accur  0.7726   LR 0.000194
Epoch  13 training loss    0.840   accur  0.7465   LR 0.000255
Epoch  14 training loss    0.573   accur  0.8243   LR 0.000194
Epoch  15 training loss    0.390   accur  0.8820   LR 0.000132
Epoch  16 training loss    0.282   accur  0.9214   LR 0

In [86]:
# def same(a,b):
#     return sum(c1==c2 for c1,c2 in zip(a,b))

def check(X,Y,verbose=False):
    "Use Levenshtein to measure how close output predictions are to truth."
    with torch.no_grad():
        valid_accur = 0
        total_compares = 0
        total_correct = 0
        total_d = 0
        for i in range(len(X)):
            x = X[i]
            y = Y[i]
            y_pred = trans.predict(x, ctoi)
            total_compares += len(y) - 1 # From "<foo>" predict "foo>" but don't count last '>' for metrics
            total_correct += tostr(y)==tostr(y_pred)
            d = editdistance.eval(tostr(y),tostr(y_pred))
            total_d += d
            if verbose>1 or verbose==1 and d>0:
                print(f"{tostr(x):20s} : {tostr(y)}")
                print(f"{'':20s} : {tostr(y_pred):20s} Levenshtein {d} out of {len(y)}")
    return total_d, total_correct

In [87]:
total_d, total_correct = check(X_train, Y_train, verbose=0)
print(f"Training average Levenshtein score {total_d/len(X_train):8.2f}, perfect accuracy {total_correct/len(X_train):8.2f}")

Training average Levenshtein score     0.98, perfect accuracy     0.83


In [88]:
total_d, total_correct = check(X_test, Y_test, verbose=0)
print(f"Testing average Levenshtein score {total_d/len(X_test):8.2f}, perfect accuracy {total_correct/len(X_test):8.2f}")

Testing average Levenshtein score     5.22, perfect accuracy     0.14
