# Translation with attention

Let's do French -> English. French has multiple phrases that map to single English phrase so can't do English->French as well. E.g.,

```
Get ready.      Prépare-toi.
Get ready.      Préparez-vous.
```

## Support code

In [1]:
import pandas as pd
import numpy as np
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
np.set_printoptions(precision=2, suppress=True, linewidth=3000, threshold=20000)
from typing import Sequence
import re

dtype = torch.float
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [2]:
def getvocab(strings):
    letters = [list(l) for l in strings]
    vocab = set([c for cl in letters for c in cl])
    vocab = sorted(list(vocab))
    ctoi = {c:i for i, c in enumerate(vocab)}
    return vocab, ctoi

In [3]:
def get_max_len(X):
    max_len = 0
    for x in X:
        max_len = max(max_len, len(x))
    return max_len

In [4]:
class Embedding:
    def __init__(self, input_size, embed_sz):
        self.E = torch.randn(embed_sz, input_size, device=device, dtype=torch.float64, requires_grad=True) # embedding
        self.input_size = input_size
        self.embed_sz = embed_sz
        with torch.no_grad():
            self.E *= 0.01
    def parameters(self): return [self.E]
    def __call__(self, x):
        if isinstance(x, int):
            return self.E[:,x].reshape(self.embed_sz, 1)
        # column E[i] is the embedding for char index i. same as multiple E.mm(onehot(i))
        return self.E[:,x]

In [5]:
class RNN:
    def __init__(self, input_sz, nhidden):
        self.W = torch.eye(nhidden,    nhidden,  device=device, dtype=torch.float64, requires_grad=True)
        self.U = torch.randn(nhidden,  input_sz, device=device, dtype=torch.float64, requires_grad=True)
        self.bx = torch.zeros(nhidden, 1,        device=device, dtype=torch.float64, requires_grad=True)
        with torch.no_grad():
            self.W *= 0.01
            self.U *= 0.01
    def parameters(self): return [self.W, self.U, self.bx]
    def __call__(self, h, x):
        h = self.W@h + self.U@x + self.bx
        h = torch.tanh(h)
        return h

In [6]:
class GRU:
    def __init__(self, input_sz, nhidden):
        self.Whz  = torch.eye(nhidden,   nhidden,  device=device, dtype=torch.float64, requires_grad=True)
        self.Whr  = torch.eye(nhidden,   nhidden,  device=device, dtype=torch.float64, requires_grad=True)
        self.Whh_ = torch.eye(nhidden,   nhidden,  device=device, dtype=torch.float64, requires_grad=True)
        self.Uxh_ = torch.randn(nhidden, input_sz, device=device, dtype=torch.float64, requires_grad=True)
        self.Uxz  = torch.randn(nhidden, input_sz, device=device, dtype=torch.float64, requires_grad=True)
        self.Uxr  = torch.randn(nhidden, input_sz, device=device, dtype=torch.float64, requires_grad=True)
    def parameters(self):
        return [self.Whz, self.Whr, self.Whh_, self.Uxh_, self.Uxz, self.Uxr]
    
    def __call__(self, h, x):
        z = torch.sigmoid(self.Whz@h + self.Uxz@x)
        r = torch.sigmoid(self.Whr@h + self.Uxr@x)
        h_ = torch.tanh(self.Whh_@(r*h) + self.Uxh_@x)
        h = torch.tanh( (1-z)*h + z*h_ )
        return h

In [7]:
class Linear:
    def __init__(self, input_size, output_size):
        self.V = torch.randn(output_size,  input_size, device=device, dtype=torch.float64, requires_grad=True)
        self.by = torch.zeros(output_size, 1,          device=device, dtype=torch.float64, requires_grad=True)
        with torch.no_grad():
            self.V *= 0.01
    def parameters(self): return [self.V, self.by]
    def __call__(self, h):
        o = self.V@h + self.by
        o = o.T # make it input_size x output_size
        return o

## Load and prepare

In [8]:
with open("data/eng-fra.txt") as f:
    text = f.read().strip().lower()

# clean up, normalize
text = re.sub(r"[ \u202f\u209f\u20bf\u2009\u3000\xa0]+", " ", text)  # there are lots of space chars in unicode
text = re.sub(r"\u200b|\xad|‐|–", "-", text)  # there are lots of space chars in unicode
text = re.sub(r"‘|’", "'", text)  # there are lots of space chars in unicode
text = text.replace("‽", "?")
text = text.replace("…", "")
text = text.replace("₂", "")
# text = text.replace("\u202f", " ")
# text = text.replace("\u209f", " ")
# text = text.replace("\u20bf", " ")
text = text.replace(" !", "")
text = text.replace(" .", "")
text = re.sub(r"([.!?])", "", text)
lines = text.split("\n")

In [9]:
lines = [line for line in lines if not len(set(line).intersection({'(',')','~','€','$','%','&','/','«','»'}))]
pairs = [line.split('\t') for line in lines]

In [10]:
MAX_LENGTH = 15
pairs = [p for p in pairs if len(p[0])<=MAX_LENGTH and len(p[1])<=MAX_LENGTH]

In [11]:
eng_prefixes = (
    "i am ", "i'm ",
    "he is ", "he's ",
    "she is ", "she's ",
    "you are ", "you're ",
    "we are ", "we're ",
    "they are ", "they're "
    )
filtered_pairs = []
for p in pairs:
    en,fr = p
    for pre in eng_prefixes:
        if en.startswith(pre):
            filtered_pairs.append(p)
            break
            
pairs = filtered_pairs            

In [12]:
pairs = pairs[0:2000] # testing

In [13]:
pairs = [(p[1],p[0]) for p in pairs]

In [14]:
len(pairs)

726

In [15]:
# Remove duplicates
pairs = list(dict(pairs).items())
len(pairs)

613

In [16]:
vocab = sorted(set('\n'.join(lines)))
vocab = vocab[2:] # drop \t and \n
vocab = ['<','>']+vocab # add delimiters as 0, 1
ctoi = {c:i for i, c in enumerate(vocab)}

In [17]:
len(vocab)

64

In [18]:
''.join(vocab)

'<> "\'+,-0123456789:;abcdefghijklmnopqrstuvwxyzàâçèéêëîïòôöùúûœас'

In [19]:
pairs[0:10]

[("j'ai 19 ans", "i'm 19"),
 ('je vais bien', "i'm doing well"),
 ('ça va', "i'm fine"),
 ('je suis gras', 'i am fat'),
 ('je suis gros', "i'm fat"),
 ('je suis touché', "i'm touched"),
 ('je suis touchée', "i'm touched"),
 ('je suis malade', 'i am sick'),
 ('je suis triste', "i'm sad"),
 ('je suis timide', "i'm timid")]

## Wrap in <...> and Numericalize

In [20]:
pairs = [(f"<{p[0]}>",f"<{p[1]}>") for p in pairs]

In [21]:
pairs[0:5]

[("<j'ai 19 ans>", "<i'm 19>"),
 ('<je vais bien>', "<i'm doing well>"),
 ('<ça va>', "<i'm fine>"),
 ('<je suis gras>', '<i am fat>'),
 ('<je suis gros>', "<i'm fat>")]

In [22]:
X = []
Y = []
for p in pairs:
    fr, en = p
    X.append([ctoi[c] for c in fr])
    Y.append([ctoi[c] for c in en])

In [23]:
X[0:5]

[[0, 29, 4, 20, 28, 2, 9, 17, 2, 20, 33, 38, 1],
 [0, 29, 24, 2, 41, 20, 28, 38, 2, 21, 28, 24, 33, 1],
 [0, 48, 20, 2, 41, 20, 1],
 [0, 29, 24, 2, 38, 40, 28, 38, 2, 26, 37, 20, 38, 1],
 [0, 29, 24, 2, 38, 40, 28, 38, 2, 26, 37, 34, 38, 1]]

In [24]:
Y[0:5]

[[0, 28, 4, 32, 2, 9, 17, 1],
 [0, 28, 4, 32, 2, 23, 34, 28, 33, 26, 2, 42, 24, 31, 31, 1],
 [0, 28, 4, 32, 2, 25, 28, 33, 24, 1],
 [0, 28, 2, 20, 32, 2, 25, 20, 39, 1],
 [0, 28, 4, 32, 2, 25, 20, 39, 1]]

## Split out validation set

In [25]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [26]:
n = len(X_train)
char_embed_sz = 20
nhidden = 300
nclasses = len(vocab) # char output vocab

print(f"{n:,d} training records, {char_embed_sz} embedding size, {nclasses} target classes, state is {nhidden}-vector")

490 training records, 20 embedding size, 64 target classes, state is 300-vector


In [33]:
class Transducer:
    def __init__(self, embed_sz, nhidden, target_sz, useGRU=False):
        self.embx = Embedding(len(ctoi), embed_sz)
        self.emby = Embedding(len(ctoi), embed_sz)
        if useGRU:
            self.rnn = GRU(embed_sz, nhidden)
#             self.rnn2 = GRU(embed_sz+nhidden, nhidden)
            self.rnn2 = GRU(embed_sz, nhidden)
            self.lin = Linear(nhidden, target_sz)
        else:
            self.rnn = RNN(embed_sz, nhidden)
#             self.rnn2 = RNN(embed_sz+nhidden, nhidden)
            self.rnn2 = RNN(embed_sz, nhidden)
            self.lin = Linear(nhidden, target_sz)
        
    def parameters(self):
        return self.embx.parameters()+\
               self.emby.parameters()+\
               self.rnn.parameters()+\
               self.rnn2.parameters()+\
               self.lin.parameters()

    def __call__(self, x, y):
        # ENCODER
        h = torch.zeros(nhidden, 1, device=device, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
        for t in range(len(x)):
            embedding_step_t = self.embx(x[t])
            h = self.rnn(h, embedding_step_t)
        c = h

        # DECODER
        output = []
        loss = 0.0
        correct = 0
#         h = torch.zeros(nhidden, 1, device=device, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
        h = c
        for t in range(len(y)-1): # don't predict next char at final '>'
            embedding_step_t = self.emby(y[t])
#             h = self.rnn2(h, torch.cat([embedding_step_t,c]))
            h = self.rnn2(h, embedding_step_t)
            o = self.lin(h)
            # From y we want to predict y[1:]. at y[t], predict y[t+1] using c as context vector
            loss += F.cross_entropy(o, torch.tensor([y[t+1]]))

            p = F.softmax(o[0], dim=0)
            y_pred = torch.argmax(p).item()
            correct += y_pred==y[t+1]
            output.append(y_pred)
    
        if correct>0:
            correct -= 1 # don't count getting final '>' correct
        return output, loss, int(correct)
    
    def predict(self, x):
        # ENCODER
        h = torch.zeros(nhidden, 1, device=device, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
        for t in range(len(x)):
            embedding_step_t = self.embx(x[t])
            h = self.rnn(h, embedding_step_t)
        c = h

        # DECODER
        loss = 0.0
        output = []
        y_pred = ctoi['<'] # begin with "start of sequence" char
        output.append(y_pred)
#         h = torch.zeros(nhidden, 1, device=device, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
        h = c
        while y_pred!=ctoi['>'] and len(output)<=MAX_LENGTH:
            embedding_step_t = self.emby(y_pred)
#             h = self.rnn2(h, torch.cat([embedding_step_t,c]))
            h = self.rnn2(h, embedding_step_t)
            o = self.lin(h)
            p = F.softmax(o, dim=0)
            y_pred = torch.argmax(p).item()
            output.append(y_pred)
        return output

In [34]:
def tostr(x):
    return ''.join([vocab[v] for v in x])

In [35]:
trans = Transducer(char_embed_sz, nhidden, nclasses, useGRU=False)
optimizer = torch.optim.Adam(trans.parameters(), lr=0.0005, weight_decay=0.0)
scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, 
                                              mode='triangular2',
                                              step_size_up=4,
                                              base_lr=0.00001, max_lr=0.0005,
                                              cycle_momentum=False)

history = []
epochs = 20
for epoch in range(1, epochs+1):
    epoch_training_loss = 0.0
    epoch_training_accur = 0.0
    total_compares = 0
    for i in range(n):
        x = X_train[i]
        y = Y_train[i]
        y_pred, loss, correct = trans(x, y)
#         if epoch==10:
#             print(f"{tostr(x)}->{tostr(y)}: {tostr(y_pred)}, {correct} correct")
        epoch_training_accur += correct
        epoch_training_loss += loss.detach().item()
        total_compares += len(y) - 2  # From "<foo>" predict "foo>" but don't count last '>' for metrics

        optimizer.zero_grad()
        loss.backward() # autograd computes U.grad, M.grad, ...
        optimizer.step()
        
#         if t % bptt == 0 and t > 0:
#             optimizer.zero_grad()
#             loss.backward() # autograd computes U.grad, M.grad, ...
#             optimizer.step()
#             epoch_training_loss += loss.detach().item()
#             loss = 0
#             H = H.detach() # no longer consider previous computations

    epoch_training_accur /= total_compares
    epoch_training_loss /= total_compares
    
    print(f"Epoch {epoch:3d} training loss {epoch_training_loss:8.3f}   accur {epoch_training_accur:7.4f}   LR {scheduler.get_last_lr()[0]:7.6f}")
    scheduler.step()    

call <he's a gambler>
call <you are good>
call <they're armed>
call <i'm ugly>
call <you're clever>
call <you're tough>
call <they're talking>
call <you're biased>
call <he's your son>
call <i am exhausted>
call <i'm famous>
call <you're careful>
call <she is dead>
call <i'm drunk>
call <you're powerful>
call <i'm frantic>
call <he's so young>
call <you're reliable>
call <i'm fine>
call <i'm broke>
call <you're upset>
call <you're crafty>
call <i'm cracking up>
call <i'm cleaned out>
call <i'm begging you>
call <you are drunk>
call <he is drunk>
call <he's studying>
call <i'm normal>
call <i'm shaken>
call <they're gone>
call <you're young>
call <you're invited>
call <you're welcome>
call <he's my age>
call <we are here>
call <i'm back>
call <they're rich>
call <you're rude>
call <he is busy>
call <i'm thrilled>
call <you're cruel>
call <i'm used to it>
call <i'm nearsighted>
call <he is sick>
call <i'm powerless>
call <i'm azerbaijani>
call <i'm back>
call <i am exhausted>
call <he is

KeyboardInterrupt: 

In [30]:
def predict(self,x):
    # ENCODER
    h = torch.zeros(nhidden, 1, device=device, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
    for t in range(len(x)):
        embedding_step_t = self.embx(x[t])
        h = self.rnn(h, embedding_step_t)
    c = h

    # DECODER
    loss = 0.0
    output = []
    y_pred = ctoi['<'] # begin with "start of sequence" char
    output.append(y_pred)
#     h = torch.zeros(nhidden, 1, device=device, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
    h = c
    while y_pred!=ctoi['>'] and len(output)<=MAX_LENGTH:
        embedding_step_t = self.emby(y_pred)
        h = self.rnn2(h, embedding_step_t)
#         h = self.rnn2(h, torch.cat([embedding_step_t,c]))
        o = self.lin(h)
        p = F.softmax(o[0], dim=0)
        y_pred = torch.argmax(p).item()
        output.append(y_pred)
    return output

with torch.no_grad():
    valid_accur = 0
    total_compares = 0
    for i in range(len(X_train)):
        x = X_train[i]
        y = Y_train[i]
#     for i in range(len(X_test)):
#         x = X_test[i]
#         y = Y_test[i]
        y_pred = predict(trans,x)
#         total_compares += len(y) - 2 # From "<foo>" predict "foo>" but don't count last '>' for metrics
        correct = 0
        print(f"{tostr(x)}->{tostr(y)}: {tostr(y_pred)}, {correct}/{len(y) - 1} correct")

    print(f"Valid loss {valid_loss:8.2f}   accur {valid_accur:7.4f}")

<il est joueur>-><he's a gambler>: <i'm sare>, 0/15 correct
<tu es bon>-><you are good>: <i'm sare>, 0/13 correct
<ils sont armés>-><they're armed>: <i'm sare>, 0/14 correct
<je suis laide>-><i'm ugly>: <i'm sare>, 0/9 correct
<tu es malin>-><you're clever>: <i'm sare>, 0/14 correct
<tu es dure>-><you're tough>: <i'm sare>, 0/13 correct
<elles parlent>-><they're talking>: <i'm sare>, 0/16 correct
<tu es partiale>-><you're biased>: <i'm sare>, 0/14 correct
<c'est ton fils>-><he's your son>: <i'm sare>, 0/14 correct
<je suis épuisé>-><i am exhausted>: <i'm sare>, 0/15 correct
<je suis connu>-><i'm famous>: <i'm sare>, 0/11 correct
<tu es prudente>-><you're careful>: <i'm sare>, 0/15 correct
<elle est morte>-><she is dead>: <i'm sare>, 0/12 correct
<je suis soûl>-><i'm drunk>: <i'm sare>, 0/10 correct
<tu es puissant>-><you're powerful>: <i'm sare>, 0/16 correct
<je suis affolé>-><i'm frantic>: <i'm sare>, 0/12 correct
<il est si jeune>-><he's so young>: <i'm sare>, 0/14 correct
<tu es fi

NameError: name 'valid_loss' is not defined