# Translation with attention

Let's translate French -> English. Several French phrases often map to the same English phrase, so the reverse direction (English -> French) has no single right answer and a model can't learn it as well. E.g.,

```
Get ready.      Prépare-toi.
Get ready.      Préparez-vous.
```

## Support code

In [316]:
import pandas as pd
import numpy as np
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
np.set_printoptions(precision=2, suppress=True, linewidth=3000, threshold=20000)
from typing import Sequence
import editdistance # Get Levenshtein (pip install editdistance)
import re

# NOTE(review): `dtype` is not referenced by any of the layer classes visible in this
# part of the notebook; they hard-code dtype=torch.float64.
dtype = torch.float
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [317]:
def getvocab(strings):
    """
    Build the character vocabulary of a collection of strings.

    Returns (vocab, ctoi): vocab is the sorted list of distinct characters
    found in any of the strings, and ctoi maps each character to its index
    in vocab.
    """
    seen = set()
    for s in strings:
        seen.update(s)  # add every character of this string
    vocab = sorted(seen)
    ctoi = dict(zip(vocab, range(len(vocab))))
    return vocab, ctoi

In [318]:
def get_max_len(X):
    "Return the length of the longest element of X, or 0 when X is empty."
    return max((len(x) for x in X), default=0)

## Load and prepare

In [319]:
class Embedding:
    """
    Trainable lookup table holding one embed_sz-dimensional vector per input symbol.

    The table E is an (embed_sz x input_size) matrix, so the vector for symbol
    index i is column E[:, i]; picking that column gives the same result as
    multiplying E by the one-hot encoding of i.
    """
    def __init__(self, input_size, embed_sz):
        self.input_size = input_size
        self.embed_sz = embed_sz
        # Entries are drawn from a standard normal and tracked by autograd.
        self.E = torch.randn(embed_sz, input_size,
                             device=device, dtype=torch.float64, requires_grad=True)

    def parameters(self):
        "Tensors an optimizer should update."
        return [self.E]

    def __call__(self, x):
        """
        Look up embeddings. A single Python int yields an (embed_sz x 1) column
        vector; any other index (e.g. a list of ints) is handed straight to
        tensor indexing and selects the matching columns of E.
        """
        columns = self.E[:, x]
        if isinstance(x, int):
            columns = columns.reshape(self.embed_sz, 1)
        return columns

In [320]:
class RNN:
    """
    Plain recurrent cell on column vectors: h' = tanh(W h + U x + bx).

    W (nhidden x nhidden) starts as the identity, U (nhidden x input_sz) is
    drawn from a standard normal and the bias bx (nhidden x 1) starts at zero.
    """
    def __init__(self, input_sz, nhidden):
        opts = dict(device=device, dtype=torch.float64, requires_grad=True)
        self.W = torch.eye(nhidden, nhidden, **opts)
        self.U = torch.randn(nhidden, input_sz, **opts)
        self.bx = torch.zeros(nhidden, 1, **opts)

    def parameters(self):
        "Tensors an optimizer should update."
        return [self.W, self.U, self.bx]

    def __call__(self, h, x):
        "Advance one step from hidden state h given input vector x; returns the new state."
        pre_activation = self.W@h + self.U@x + self.bx
        return torch.tanh(pre_activation)

In [321]:
class DecoderRNN(RNN):
    """
    Recurrent cell for the decoder. It behaves like RNN, but every step also
    receives a context vector c that is projected by an extra trainable matrix
    C (nhidden x context_sz, created with torch.eye):

        h' = tanh(W h + C c + U x + bx)
    """
    def __init__(self, input_sz, context_sz, nhidden):
        super().__init__(input_sz, nhidden)
        self.C = torch.eye(nhidden, context_sz,
                           device=device, dtype=torch.float64, requires_grad=True)

    def parameters(self):
        "Parameters of the base cell followed by C."
        return [*super().parameters(), self.C]

    def __call__(self, h, c, x):
        "Advance one step from state h given context c and input vector x."
        return torch.tanh(self.W@h + self.C@c + self.U@x + self.bx)

In [322]:
class GRU:
    """
    Gated recurrent cell operating on column vectors.

    One step computes (sigma is the logistic sigmoid, * is elementwise):

        z  = sigma(Whz h + Uxz x + bz)            update gate
        r  = sigma(Whr h + Uxr x + br)            reset gate
        h_ = tanh(Whh_ (r*h) + Uxh_ x + bh_)      candidate state
        h' = tanh((1-z)*h + z*h_)

    Note that the blended state goes through one more tanh; torch.nn.GRU uses
    the blend directly.

    The hidden-to-hidden matrices start as the identity, the input matrices
    are drawn from a standard normal and the biases start at zero. The biases
    are only reported by parameters() when include_bias is true; otherwise an
    optimizer built from parameters() never updates them.
    """
    def __init__(self, input_sz, nhidden, include_bias=False):
        opts = dict(device=device, dtype=torch.float64, requires_grad=True)
        # hidden -> hidden
        self.Whz  = torch.eye(nhidden, nhidden, **opts)
        self.Whr  = torch.eye(nhidden, nhidden, **opts)
        self.Whh_ = torch.eye(nhidden, nhidden, **opts)
        # input -> hidden
        self.Uxh_ = torch.randn(nhidden, input_sz, **opts)
        self.Uxz  = torch.randn(nhidden, input_sz, **opts)
        self.Uxr  = torch.randn(nhidden, input_sz, **opts)
        # biases (trained only when include_bias is set)
        self.bz   = torch.zeros(nhidden, 1, **opts)
        self.br   = torch.zeros(nhidden, 1, **opts)
        self.bh_  = torch.zeros(nhidden, 1, **opts)
        self.include_bias = include_bias
        self.first_h_shape = None # debugging: shape of the first hidden state seen

    def parameters(self):
        "Weight matrices, plus the three biases when include_bias is true."
        weights = [self.Whz, self.Whr, self.Whh_, self.Uxh_, self.Uxz, self.Uxr]
        biases = [self.bz, self.br, self.bh_] if self.include_bias else []
        return weights + biases

    def __call__(self, h, x):
        """
        Advance one step from hidden state h given input vector x.

        Raises ValueError when h has a different shape from the first hidden
        state this cell was ever called with (a debugging guard).
        """
        if self.first_h_shape is None:
            self.first_h_shape = h.shape
        elif h.shape != self.first_h_shape:
            raise ValueError(f"hidden h vector changed shape in {self.__class__.__name__} from {self.first_h_shape} to {h.shape}")
        update = torch.sigmoid(self.Whz@h + self.Uxz@x + self.bz)
        reset = torch.sigmoid(self.Whr@h + self.Uxr@x + self.br)
        candidate = torch.tanh(self.Whh_@(reset*h) + self.Uxh_@x + self.bh_)
        return torch.tanh((1-update)*h + update*candidate)

In [323]:
class DecoderGRU(GRU):
    """
    GRU cell for the decoder. Every step also receives a context vector c,
    projected by one extra trainable matrix C (nhidden x context_sz, created
    with torch.eye). The same projection C c is added inside the update gate,
    the reset gate and the candidate state.

    Unlike GRU.__call__, this __call__ does not run the hidden-state shape check.
    """
    def __init__(self, input_sz, context_sz, nhidden, include_bias=False):
        super().__init__(input_sz, nhidden, include_bias)
        self.C = torch.eye(nhidden, context_sz,
                           device=device, dtype=torch.float64, requires_grad=True)

    def parameters(self):
        "Parameters of the base cell followed by C."
        return [*super().parameters(), self.C]

    def __call__(self, h, c, x):
        "Advance one step from state h given context c and input vector x."
        update = torch.sigmoid(self.Whz@h + self.C@c + self.Uxz@x + self.bz)
        reset = torch.sigmoid(self.Whr@h + self.C@c + self.Uxr@x + self.br)
        candidate = torch.tanh(self.Whh_@(reset*h) + self.C@c + self.Uxh_@x + self.bh_)
        return torch.tanh((1-update)*h + update*candidate)

In [324]:
class Linear:
    """
    Affine output layer computing V h + by and returning the result transposed.

    V is (output_size x input_size), drawn from a standard normal; by is an
    (output_size x 1) vector of zeros.
    """
    def __init__(self, input_size, output_size):
        opts = dict(device=device, dtype=torch.float64, requires_grad=True)
        self.V = torch.randn(output_size, input_size, **opts)
        self.by = torch.zeros(output_size, 1, **opts)

    def parameters(self):
        "Tensors an optimizer should update."
        return [self.V, self.by]

    def __call__(self, h):
        # V@h + by has one column per column of h. Transposing gives one row per
        # column of h with output_size entries each, e.g. 1 x output_size for a
        # single column vector h.
        return (self.V@h + self.by).T

In [325]:
class Dropout:
    """
    Inverted dropout: each activation is zeroed with probability p and the
    survivors are scaled by 1/(1-p), so nothing has to be rescaled at test time
    (after dropping a fraction p, a fraction 1-p is left untouched on average).

    p == 0 returns the input unchanged; p == 1 zeroes everything (no division
    by zero). Raises ValueError when p is outside [0, 1].
    """
    def __init__(self, p=0.0, fixed=False):
        "If fixed, reuse same mask for all future uses of this layer."
        if not 0.0 <= p <= 1.0:
            raise ValueError(f"dropout probability must be in [0, 1], got {p}")
        self.p = p
        self.fixed = fixed
        self.mask = None

    def _new_mask(self, v):
        "Draw a 0/1 keep-mask shaped like v; an entry is kept when its uniform sample exceeds p."
        usample = torch.empty_like(v).uniform_(0, 1)  # one random value per activation
        return (usample > self.p).int()

    def __call__(self, v):
        if self.p == 0:
            # Nothing to drop. Returning early also avoids losing an activation
            # whose uniform sample happens to be exactly 0.0.
            return v
        if self.fixed:
            if self.mask is None:
                self.mask = self._new_mask(v)  # drawn once, on first use
            mask = self.mask
        else:
            mask = self._new_mask(v)
        v = v * mask                   # kill masked activations
        if self.p < 1:
            v = v / (1 - self.p)       # scale survivors; skipped at p == 1 where it would be 0/0
        return v

In [326]:
class Transducer:
    """
    Character-level encoder/decoder for sequence-to-sequence translation.

    The encoder reads the embedded input symbols one at a time; its final
    hidden state is the context vector c. The decoder starts from a zero
    hidden state (not from c) and is handed c at every step together with the
    embedding of the previous output symbol. A linear layer maps the decoder
    state to one score per output symbol.

    Parameters:
        input_sz, output_sz: sizes of the input and output vocabularies.
        input_embed_sz, output_embed_sz: embedding widths for each side.
        nhidden: hidden-state size shared by encoder and decoder; it is also
            used as the context size.
        dropout: drop probability applied in __call__ (training) only.
        useGRU: use GRU/DecoderGRU cells instead of RNN/DecoderRNN.
    """
    def __init__(self, input_sz, output_sz, input_embed_sz, output_embed_sz, nhidden,
                 dropout=0.0,
                 useGRU=False):
        self.dropout = dropout
        # Kept on the instance so __call__/predict size their hidden state from
        # this model rather than from whatever notebook-level `nhidden` exists.
        self.nhidden = nhidden
        self.embx = Embedding(input_sz, input_embed_sz)
        self.emby = Embedding(output_sz, output_embed_sz)
        self.lin = Linear(nhidden, output_sz)
        if useGRU:
            self.encoder = GRU(input_embed_sz, nhidden)
            self.decoder = DecoderGRU(output_embed_sz, nhidden, nhidden)
        else:
            self.encoder = RNN(input_embed_sz, nhidden)
            self.decoder = DecoderRNN(output_embed_sz, nhidden, nhidden)

    def parameters(self):
        "All trainable tensors: both embeddings, the output layer, encoder and decoder."
        return self.embx.parameters()+\
               self.emby.parameters()+\
               self.lin.parameters()+\
               self.encoder.parameters()+\
               self.decoder.parameters()

    def _zero_state(self):
        "Fresh all-zero (nhidden x 1) hidden state, used at the start of every record."
        return torch.zeros(self.nhidden, 1, device=device, dtype=torch.float64, requires_grad=False)

    def _encode(self, x, dropout=None):
        "Run the encoder over the symbol indices in x and return its final hidden state (the context)."
        h = self._zero_state()
        for t in range(len(x)):
            embedding_step_t = self.embx(x[t])
            if dropout is not None:
                embedding_step_t = dropout(embedding_step_t)
            h = self.encoder(h, embedding_step_t)
        return h

    def __call__(self, x, y):
        """
        Training pass over one record with teacher forcing: the true symbol y[t]
        is fed to the decoder to predict y[t+1].

        x, y: sequences of symbol indices (input and target).
        Returns (output, loss, correct): the predicted index at each step, the
        summed cross-entropy over the len(y)-1 predictions, and how many
        predictions matched the target.
        """
        # One mask per dropout site, drawn on first use and reused for every
        # time step of this record.
        x_dropout = Dropout(p=self.dropout, fixed=True)
        y_dropout = Dropout(p=self.dropout, fixed=True)
        z_dropout = Dropout(p=self.dropout, fixed=True)

        # ENCODER
        c = self._encode(x, x_dropout)

        # DECODER
        output = []
        loss = 0.0
        correct = 0
        h = self._zero_state()
        for t in range(len(y)-1): # don't predict next char at final '>'
            embedding_step_t = y_dropout(self.emby(y[t]))
            h = self.decoder(h, c, embedding_step_t)
            o = z_dropout(self.lin(h))  # dropout is applied to the output scores as well
            # From y we want to predict y[1:]. at y[t], predict y[t+1] using c as context vector
            target = torch.tensor([y[t+1]], device=device)
            loss += F.cross_entropy(o, target, reduction="sum")
            p = F.softmax(o, dim=1)
            y_pred = torch.argmax(p, dim=1).item()
            correct += y_pred==y[t+1]
            output.append(y_pred)
        return output, loss, int(correct)

    def predict(self, x, Y_ctoi, max_length=20):
        """
        Greedy decoding of input sequence x, without dropout.

        Starts from the index of '<' in Y_ctoi and repeatedly feeds the most
        likely symbol back into the decoder until '>' is produced or more than
        max_length symbols have been collected (a safety stop). Returns the
        list of symbol indices, including the leading '<'.
        """
        # ENCODER
        c = self._encode(x)

        # DECODER
        start, stop = Y_ctoi['<'], Y_ctoi['>']
        y_pred = start # begin with "start of sequence" char
        output = [y_pred]
        h = self._zero_state()
        while y_pred!=stop and len(output)<=max_length:
            embedding_step_t = self.emby(y_pred)
            h = self.decoder(h, c, embedding_step_t)
            o = self.lin(h)
            p = F.softmax(o, dim=1)
            y_pred = torch.argmax(p, dim=1).item()
            output.append(y_pred)
        return output

In [327]:
# The corpus contains accented French text, so decode it explicitly as UTF-8
# instead of relying on the platform's default encoding.
with open("data/eng-fra.txt", encoding="utf-8") as f:
    text = f.read().strip().lower()

# clean up, normalize
text = re.sub(r"[ \u202f\u209f\u20bf\u2009\u3000\xa0]+", " ", text)  # collapse runs of the listed space-like chars into one plain space
text = re.sub(r"\u200b|\xad|‐|–", "-", text)  # zero-width space, soft hyphen and unicode dashes all become '-'
text = re.sub(r"‘|’", "'", text)  # curly single quotes become a plain apostrophe
text = text.replace("‽", "?")
text = text.replace("…", "")
text = text.replace("₂", "")
# drop sentence punctuation, including the space that precedes it
text = text.replace(" !", "")
text = text.replace(" .", "")
text = re.sub(r"([.!?])", "", text)
lines = text.split("\n")

In [328]:
# Discard every line that contains one of these symbols, then split what is
# left into its tab-separated fields.
lines = [line for line in lines if set(line).isdisjoint({'(',')','~','€','$','%','&','/','«','»'})]
pairs = [line.split('\t') for line in lines]

In [329]:
MAX_LENGTH = 15
# Keep a pair only when neither of its two sides is longer than MAX_LENGTH characters.
pairs = [p for p in pairs if not (len(p[0])>MAX_LENGTH or len(p[1])>MAX_LENGTH)]

In [330]:
# Optional: keep only the pairs whose first field starts with one of a few
# simple English subject/verb prefixes. Switched off by default.
FILTER = False
if FILTER:
    eng_prefixes = (
        "i am ", "i'm ",
        "he is ", "he's ",
        "she is ", "she's ",
        "you are ", "you're ",
        "we are ", "we're ",
        "they are ", "they're "
        )
    filtered_pairs = []
    for p in pairs:
        en,fr = p
        if en.startswith(eng_prefixes):  # str.startswith accepts a tuple of prefixes
            filtered_pairs.append(p)

    pairs = filtered_pairs

In [331]:
pairs = pairs[0:100] # testing: work with just the first 100 pairs

In [332]:
# Swap the two fields of every pair so that French comes first (the source side)
# and English second (the target side).
# NOTE: re-running this cell swaps the pairs back again.
pairs = [(p[1],p[0]) for p in pairs]

In [333]:
len(pairs)

100

In [334]:
# Remove duplicates
pairs = list(dict(pairs).items())
len(pairs)

90

In [335]:
# Character vocabulary over the whole cleaned corpus. The tab/newline separators
# and the two delimiter characters are removed explicitly instead of slicing off
# the first two sorted entries, so '<' and '>' always get ids 0 and 1 and can
# never appear in the vocabulary twice.
vocab = sorted(set('\n'.join(lines)) - {'\t', '\n', '<', '>'})
vocab = ['<','>']+vocab # add delimiters as 0, 1
ctoi = {c:i for i, c in enumerate(vocab)}

In [336]:
len(vocab)

64

In [337]:
''.join(vocab)

'<> "\'+,-0123456789:;abcdefghijklmnopqrstuvwxyzàâçèéêëîïòôöùúûœас'

In [338]:
pairs[0:10]

[('va', 'go'),
 ('cours', 'run'),
 ('courez', 'run'),
 ('ça alors', 'wow'),
 ('au feu', 'fire'),
 ("à l'aide", 'help'),
 ('saute', 'jump'),
 ('ça suffit', 'stop'),
 ('stop', 'stop'),
 ('arrête-toi', 'stop')]

## Wrap in <...> and Numericalize

In [339]:
# Wrap both sides of every pair in the '<' (start) and '>' (end) delimiters.
# NOTE: re-running this cell wraps the strings a second time.
pairs = [(f"<{p[0]}>",f"<{p[1]}>") for p in pairs]

In [340]:
pairs[0:5]

[('<va>', '<go>'),
 ('<cours>', '<run>'),
 ('<courez>', '<run>'),
 ('<ça alors>', '<wow>'),
 ('<au feu>', '<fire>')]

In [341]:
# Turn every string into its list of character ids: X holds the French side,
# Y the English side, in the same order as `pairs`.
X = [[ctoi[c] for c in fr] for fr, en in pairs]
Y = [[ctoi[c] for c in en] for fr, en in pairs]

In [342]:
X[0:5]

[[0, 41, 20, 1],
 [0, 22, 34, 40, 37, 38, 1],
 [0, 22, 34, 40, 37, 24, 45, 1],
 [0, 48, 20, 2, 20, 31, 34, 37, 38, 1],
 [0, 20, 40, 2, 25, 24, 40, 1]]

In [343]:
Y[0:5]

[[0, 26, 34, 1],
 [0, 37, 40, 33, 1],
 [0, 37, 40, 33, 1],
 [0, 42, 34, 42, 1],
 [0, 25, 28, 37, 24, 1]]

## Split out validation set

In [344]:
# Hold out 20% of the records for testing. The split is seeded so that the
# same records are held out on every run of the notebook.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [345]:
n = len(X_train)       # number of training records
char_embed_sz = 10     # width of a character embedding (used for both input and output below)
nhidden = 300          # size of the encoder/decoder hidden state
nclasses = len(vocab) # char output vocab

print(f"{n:,d} training records, {char_embed_sz} embedding size, {nclasses} target classes, state is {nhidden}-vector")

72 training records, 10 embedding size, 64 target classes, state is 300-vector


In [346]:
def tostr(x):
    "Decode a sequence of character ids back into a string using the notebook's `vocab`."
    return ''.join(vocab[idx] for idx in x)

In [347]:
# Model: the same character vocabulary and embedding width on both sides, GRU cells.
trans = Transducer(input_sz=len(ctoi),
                   output_sz=len(ctoi),
                   input_embed_sz=char_embed_sz,
                   output_embed_sz=char_embed_sz,
                   nhidden=nhidden,
                   dropout=0.0,
                   useGRU=True)

# The cyclic scheduler drives the learning rate: training starts at base_lr and
# the rate then moves between base_lr and max_lr, stepped once per epoch at the
# bottom of the epoch loop.
optimizer = torch.optim.Adam(trans.parameters(), lr=0.0005, weight_decay=0.0)
scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer,
                                              mode='triangular2',
                                              step_size_up=4,
                                              base_lr=0.000001, max_lr=0.0005,
                                              cycle_momentum=False)

history = []
epochs = 10
for epoch in range(1, epochs+1):
    epoch_training_loss = 0.0
    epoch_training_accur = 0.0
    total_compares = 0
    for i in range(n):
        x, y = X_train[i], Y_train[i]
        y_pred, loss, correct = trans(x, y)

        # One parameter update per training record
        optimizer.zero_grad()
        loss.backward()   # autograd fills in .grad for every model parameter
        optimizer.step()

        # Running totals for this epoch's metrics
        epoch_training_accur += correct
        epoch_training_loss += loss.detach().item()
        total_compares += len(y) - 1  # From "<foo>" predict and count "foo>"

    # Per-character averages over the epoch
    epoch_training_accur /= total_compares
    epoch_training_loss /= total_compares

    print(f"Epoch {epoch:3d} training loss {epoch_training_loss:8.3f}   accur {epoch_training_accur:7.4f}   LR {scheduler.get_last_lr()[0]:7.6f}")
    scheduler.step()

Epoch   1 training loss   21.657   accur  0.0059   LR 0.000001
Epoch   2 training loss   16.619   accur  0.0533   LR 0.000126
Epoch   3 training loss    9.126   accur  0.1775   LR 0.000251
Epoch   4 training loss    4.766   accur  0.4694   LR 0.000375
Epoch   5 training loss    2.604   accur  0.6154   LR 0.000500
Epoch   6 training loss    1.288   accur  0.8008   LR 0.000375
Epoch   7 training loss    0.820   accur  0.8797   LR 0.000251
Epoch   8 training loss    0.569   accur  0.9112   LR 0.000126
Epoch   9 training loss    0.479   accur  0.9270   LR 0.000001
Epoch  10 training loss    0.481   accur  0.9290   LR 0.000063


In [348]:
# def same(a,b):
#     return sum(c1==c2 for c1,c2 in zip(a,b))

def check(X,Y,verbose=False):
    """
    Use Levenshtein to measure how close output predictions are to truth.

    For every record, the notebook-level model `trans` predicts a sequence from
    X[i] (with gradients disabled) and the decoded prediction is compared with
    the decoded Y[i], delimiters included.

    Returns (total_d, total_correct): the edit distances summed over all
    records and the number of records predicted exactly. With verbose=True the
    input, truth, prediction and distance are printed for each record.
    """
    total_correct = 0
    total_d = 0
    with torch.no_grad():
        for i, x in enumerate(X):
            y = Y[i]
            truth = tostr(y)
            guess = tostr(trans.predict(x, ctoi))
            d = editdistance.eval(truth, guess)
            total_d += d
            total_correct += truth == guess
            if verbose:
                print(f"{tostr(x):20s} : {truth}")
                print(f"{'':20s} : {guess:20s} Levenshtein {d} out of {len(y)}")
    return total_d, total_correct

In [349]:
# Predictions on the training records: average edit distance per record (lower is
# better) and the fraction of records reproduced exactly.
total_d, total_correct = check(X_train, Y_train)
print(f"Training average Levenshtein score {total_d/len(X_train):8.2f}, perfect accuracy {total_correct/len(X_train):8.2f}")

Training average Levenshtein score     1.42, perfect accuracy     0.71


In [350]:
# Same metrics on the held-out records, printing every input, truth and prediction.
total_d, total_correct = check(X_test, Y_test, verbose=True)
print(f"Testing average Levenshtein score {total_d/len(X_test):8.2f}, perfect accuracy {total_correct/len(X_test):8.2f}")

<j'ai perdu>         : <i lost>
                     : <be cool>            Levenshtein 5 out of 8
<sois gentille>      : <be nice>
                     : <be nice>            Levenshtein 0 out of 9
<c'est exclu>        : <no way>
                     : <be cok>             Levenshtein 5 out of 8
<sois juste>         : <be fair>
                     : <come n>             Levenshtein 7 out of 9
<soyez calmes>       : <be calm>
                     : <go away>            Levenshtein 6 out of 9
<j'essaye>           : <i try>
                     : <be cok>             Levenshtein 5 out of 7
<au feu>             : <fire>
                     : <be coo>             Levenshtein 6 out of 6
<fantastique>        : <awesome>
                     : <aûtt en>            Levenshtein 6 out of 9
<à l'aide>           : <help>
                     : <go away>            Levenshtein 7 out of 6
<demande à tom>      : <ask tom>
                     : <o aû aûraly>        Levenshtein 10 out of 9
<impossib