# Translation vectorized

Let's do French -> English. French has multiple phrases that map to single English phrase so can't do English->French as well. E.g.,

```
Get ready.      Prépare-toi.
Get ready.      Préparez-vous.
```

## Support code

In [1]:
import pandas as pd
import numpy as np
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
np.set_printoptions(precision=2, suppress=True, linewidth=3000, threshold=20000)
from typing import Sequence
import editdistance # Get Levenshtein (pip install editdistance)
import re

dtype = torch.float
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [2]:
def getvocab(strings):
    letters = [list(l) for l in strings]
    vocab = set([c for cl in letters for c in cl])
    vocab = sorted(list(vocab))
    ctoi = {c:i for i, c in enumerate(vocab)}
    return vocab, ctoi

In [3]:
def get_max_len(X):
    max_len = 0
    for x in X:
        max_len = max(max_len, len(x))
    return max_len

In [4]:
class Embedding:
    def __init__(self, input_size, embed_sz):
        self.E = torch.randn(embed_sz, input_size, device=device, dtype=torch.float64, requires_grad=True) # embedding
        self.input_size = input_size
        self.embed_sz = embed_sz
#         with torch.no_grad():
#             self.E *= 0.01
    def parameters(self): return [self.E]
    def __call__(self, x):
        if isinstance(x, int) or (x.dim()==0 or isinstance(x, torch.Tensor) and x.dim()==1 and len(x)==1):
            batch_size = 1
        elif isinstance(x, torch.Tensor) and x.dim()==1:
            batch_size = x.shape[0]
        if isinstance(x, torch.Tensor): x.dim()==1
        
        # column E[i] is the embedding for char index i. same as multiple E.mm(onehot(i))
        return self.E[:,x].reshape(self.embed_sz, batch_size)

In [5]:
class RNN:
    def __init__(self, input_sz, nhidden):
        self.W = torch.eye(nhidden,    nhidden,  device=device, dtype=torch.float64, requires_grad=True)
        self.U = torch.randn(nhidden,  input_sz, device=device, dtype=torch.float64, requires_grad=True)
        self.bx = torch.zeros(nhidden, 1,        device=device, dtype=torch.float64, requires_grad=True)
#         with torch.no_grad():
#             self.W *= 0.01
#             self.U *= 0.01
    def parameters(self): return [self.W, self.U, self.bx]
    def __call__(self, h, x):
        h = self.W@h + self.U@x + self.bx
        h = torch.tanh(h)
        return h

In [6]:
class DecoderRNN(RNN):
    def __init__(self, input_sz, context_sz, nhidden):
        super().__init__(input_sz, nhidden)
        self.C = torch.eye(nhidden,    context_sz, device=device, dtype=torch.float64, requires_grad=True)
    def parameters(self): return super().parameters()+[self.C]
    def __call__(self, h, c, x):
        h = self.W@h + self.C@c + self.U@x + self.bx
        h = torch.tanh(h)
        return h    

In [7]:
class GRU:
    def __init__(self, input_sz, nhidden, include_bias=False):
        self.Whz  = torch.eye(nhidden,   nhidden,  device=device, dtype=torch.float64, requires_grad=True)
        self.Whr  = torch.eye(nhidden,   nhidden,  device=device, dtype=torch.float64, requires_grad=True)
        self.Whh_ = torch.eye(nhidden,   nhidden,  device=device, dtype=torch.float64, requires_grad=True)
        self.Uxh_ = torch.randn(nhidden, input_sz, device=device, dtype=torch.float64, requires_grad=True)
        self.Uxz  = torch.randn(nhidden, input_sz, device=device, dtype=torch.float64, requires_grad=True)
        self.Uxr  = torch.randn(nhidden, input_sz, device=device, dtype=torch.float64, requires_grad=True)
        # if include_bias these stay 0
        self.bz   = torch.zeros(nhidden, 1,        device=device, dtype=torch.float64, requires_grad=True)
        self.br   = torch.zeros(nhidden, 1,        device=device, dtype=torch.float64, requires_grad=True)
        self.bh_  = torch.zeros(nhidden, 1,        device=device, dtype=torch.float64, requires_grad=True)
        self.include_bias = include_bias
    def parameters(self):
        p = [self.Whz, self.Whr, self.Whh_, self.Uxh_, self.Uxz, self.Uxr]
        if self.include_bias:
            p += [self.bz, self.br, self.bh_]    
        return p
    def __call__(self, h, x):
        z = torch.sigmoid(self.Whz@h    + self.Uxz@x  + self.bz)
        r = torch.sigmoid(self.Whr@h    + self.Uxr@x  + self.br)
        h_ = torch.tanh(self.Whh_@(r*h) + self.Uxh_@x + self.bh_)
#         print(h.shape, z.shape, r.shape, h_.shape)
        h = torch.tanh( (1-z)*h + z*h_ )
        return h

In [8]:
class DecoderGRU(GRU):
    def __init__(self, input_sz, context_sz, nhidden, include_bias=False):
        super().__init__(input_sz, nhidden, include_bias)
        self.C = torch.eye(nhidden,    context_sz, device=device, dtype=torch.float64, requires_grad=True)
    def parameters(self): return super().parameters()+[self.C]
    def __call__(self, h, c, x):
        z = torch.sigmoid(self.Whz@h    + self.C@c + self.Uxz@x  + self.bz)
        r = torch.sigmoid(self.Whr@h    + self.C@c + self.Uxr@x  + self.br)
        h_ = torch.tanh(self.Whh_@(r*h) + self.C@c + self.Uxh_@x + self.bh_)
        h = torch.tanh( (1-z)*h + z*h_ )
        return h

In [9]:
class Linear:
    def __init__(self, input_size, output_size):
        self.V = torch.randn(output_size,  input_size, device=device, dtype=torch.float64, requires_grad=True)
        self.by = torch.zeros(output_size, 1,          device=device, dtype=torch.float64, requires_grad=True)
#         with torch.no_grad():
#             self.V *= 0.01
    def parameters(self): return [self.V, self.by]
    def __call__(self, h):
        o = self.V@h + self.by
        o = o.T # make it input_size x output_size
        return o

In [10]:
class Dropout:
    def __init__(self, p=0.0, fixed=False):
        """
        If fixed, reuse same mask for all future uses of this layer.
        Assumes v columns are the layer activations. If batch size is 1, then this will be a column vector.
        Same column knockout used for each column in incoming matrix and future invocations if fixed.
        If not fixed, different knockout mask used for each column.
        """
        self.p = p
        self.fixed = fixed
        self.mask = None
    def __call__(self, v):
        return v
        """
        Column(s) are activation vectors. Get a new column mask and knockout elements with
        it for each column (unless fixed).
        """
        if isinstance(v, list):
            v = torch.tensor(v, device=device)

        if self.fixed and self.mask is None:
            mast = self.mask = (usample>self.p).int()

        usample = torch.empty_like(v).uniform_(0, 1)     # get random value for each activation matrix element
        mask = (usample>self.p).int()                    # get boolean mask as "those with value greater than p"
        v = v * mask                                     # kill masked activations
        v /= 1 - self.p                                  # scale during training by 1/(1-p) to avoid scaling by p at test time
                                                         # after dropping p activations, (1-p) are left untouched, on average
        return v            

In [11]:
a = torch.tensor([range(8),range(20,28)], dtype=torch.float64).T
a

tensor([[ 0., 20.],
        [ 1., 21.],
        [ 2., 22.],
        [ 3., 23.],
        [ 4., 24.],
        [ 5., 25.],
        [ 6., 26.],
        [ 7., 27.]], dtype=torch.float64)

In [12]:
a[:,1]

tensor([20., 21., 22., 23., 24., 25., 26., 27.], dtype=torch.float64)

## Load and prepare

In [13]:
with open("data/eng-fra.txt") as f:
    text = f.read().strip().lower()

# clean up, normalize
text = re.sub(r"[ \u202f\u209f\u20bf\u2009\u3000\xa0]+", " ", text)  # there are lots of space chars in unicode
text = re.sub(r"\u200b|\xad|‐|–", "-", text)  # there are lots of space chars in unicode
text = re.sub(r"‘|’", "'", text)  # there are lots of space chars in unicode
text = text.replace("‽", "?")
text = text.replace("…", "")
text = text.replace("₂", "")
# text = text.replace("\u202f", " ")
# text = text.replace("\u209f", " ")
# text = text.replace("\u20bf", " ")
text = text.replace(" !", "")
text = text.replace(" .", "")
text = re.sub(r"([.!?])", "", text)
lines = text.split("\n")

In [14]:
lines = [line for line in lines if not len(set(line).intersection({'(',')','~','€','$','%','&','/','«','»'}))]
pairs = [line.split('\t') for line in lines]
len(pairs)

135614

In [15]:
MAX_LENGTH = 18
pairs = [p for p in pairs if len(p[0])<=MAX_LENGTH and len(p[1])<=MAX_LENGTH]
len(pairs)

9748

In [16]:
FILTER = False
if FILTER:
    eng_prefixes = (
        "i am ", "i'm ",
        "he is ", "he's ",
        "she is ", "she's ",
        "you are ", "you're ",
        "we are ", "we're ",
        "they are ", "they're "
        )
    filtered_pairs = []
    for p in pairs:
        en,fr = p
        for pre in eng_prefixes:
            if en.startswith(pre):
                filtered_pairs.append(p)
                break

    pairs = filtered_pairs            

In [17]:
pairs = pairs[0:1000] # testing

In [18]:
pairs = [(p[1],p[0]) for p in pairs]

In [19]:
len(pairs)

1000

In [20]:
# Remove duplicates
pairs = list(dict(pairs).items())
len(pairs)

884

In [21]:
vocab = sorted(set('\n'.join(lines)))
vocab = vocab[2:] # drop \t and \n
vocab = ['<','>']+vocab # add delimiters as 0, 1
ctoi = {c:i for i, c in enumerate(vocab)}

In [22]:
len(vocab)

64

In [23]:
''.join(vocab)

'<> "\'+,-0123456789:;abcdefghijklmnopqrstuvwxyzàâçèéêëîïòôöùúûœас'

In [24]:
pairs[0:10]

[('va', 'go'),
 ('cours', 'run'),
 ('courez', 'run'),
 ('ça alors', 'wow'),
 ('au feu', 'fire'),
 ("à l'aide", 'help'),
 ('saute', 'jump'),
 ('ça suffit', 'stop'),
 ('stop', 'stop'),
 ('arrête-toi', 'stop')]

In [25]:
a = torch.tensor(range(5)).reshape(-1,1)
b = torch.tensor(range(5,10)).reshape(-1,1)
torch.cat([a,b], dim=1)

tensor([[0, 5],
        [1, 6],
        [2, 7],
        [3, 8],
        [4, 9]])

## Wrap in <...> and Numericalize

In [26]:
pairs = [(f"{p[0]}",f"<{p[1]}>") for p in pairs]  # X doesn't need <...> brackets
pairs[0:5]

[('va', '<go>'),
 ('cours', '<run>'),
 ('courez', '<run>'),
 ('ça alors', '<wow>'),
 ('au feu', '<fire>')]

In [27]:
pairs[0:5]

[('va', '<go>'),
 ('cours', '<run>'),
 ('courez', '<run>'),
 ('ça alors', '<wow>'),
 ('au feu', '<fire>')]

In [28]:
# numericalize and left pad
X = torch.zeros(len(pairs), MAX_LENGTH, device=device, dtype=torch.long) # zero implies padding
for i,p in enumerate(pairs):
    fr, en = p
    pad = MAX_LENGTH - len(fr)
    for j in range(len(fr)):
        X[i,j+pad] = ctoi[fr[j]]
X[5:10]

tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 46,  2, 31,  4, 20, 28, 23, 24],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 38, 20, 40, 39, 24],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 48, 20,  2, 38, 40, 25, 25, 28, 39],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 38, 39, 34, 35],
        [ 0,  0,  0,  0,  0,  0,  0,  0, 20, 37, 37, 51, 39, 24,  7, 39, 34, 28]],
       device='cuda:0')

In [29]:
Y = []
for i,p in enumerate(pairs):
    fr, en = p
    pad = MAX_LENGTH - len(en) + 2 # include <...>
    Y.append([ctoi[d] for d in en]+[ctoi['>']]*pad)  # pad with "end of string" symbols '>'
Y = torch.tensor(Y, device=device)
Y[0:5]

tensor([[ 0, 26, 34,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1],
        [ 0, 37, 40, 33,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1],
        [ 0, 37, 40, 33,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1],
        [ 0, 42, 34, 42,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1],
        [ 0, 25, 28, 37, 24,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          1,  1]], device='cuda:0')

## Split out validation set

In [30]:
char_embed_sz = 10
nhidden = 512
nclasses = len(vocab) # char output vocab
batch_size = 16

n = len(X)
n = batch_size * n//batch_size
print(f"before {len(X)}, after {n}")
X, Y = X[:n], Y[:n]

before 884, after 884


In [31]:
ridx = torch.randperm(len(X))
# shuffle
X = X[ridx]
Y = Y[ridx]
# split
ntrain = int(0.8 * len(X))
X_train, X_test = X[:ntrain], X[ntrain:]
Y_train, Y_test = Y[:ntrain], Y[ntrain:]

In [32]:
def tostr(x):
    s = ''.join([vocab[v] for v in x])
    if '>' in s:
        i = s.index('>')
        return s[0:i+1]
    return s

In [33]:
class Transducer:
    def __init__(self, input_sz, output_sz, input_embed_sz, output_embed_sz, nhidden, 
                 dropout=0.0,
                 useGRU=False):
        self.dropout = dropout
        self.embx = Embedding(input_sz, input_embed_sz)
        self.emby = Embedding(output_sz, output_embed_sz)
        self.lin = Linear(nhidden, output_sz)
        if useGRU:
            self.encoder = GRU(input_embed_sz, nhidden)
            self.decoder = DecoderGRU(output_embed_sz, nhidden, nhidden)
        else:
            self.encoder = RNN(input_embed_sz, nhidden)
            self.decoder = DecoderRNN(output_embed_sz, nhidden, nhidden)
        
    def parameters(self):
        return self.embx.parameters()+\
               self.emby.parameters()+\
               self.lin.parameters()+\
               self.encoder.parameters()+\
               self.decoder.parameters()

    def __call__(self, x, y):
        encoder_h_dropout = Dropout(p=self.dropout, fixed=False)
        decoder_h_dropout = Dropout(p=self.dropout, fixed=False)
        
        x_dropout = Dropout(p=self.dropout, fixed=False)
        y_dropout = Dropout(p=self.dropout, fixed=False)
        z_dropout = Dropout(p=self.dropout, fixed=False)
        
        if isinstance(x, list):
            x = torch.tensor(x, device=device)
        if isinstance(y, list):
            y = torch.tensor(y, device=device)
        
        assert x.dim()==1 or x.dim()==2
        assert y.dim()==1 or y.dim()==2
        
        if x.dim()==1:
            batch_size = 1
            x = x.reshape(1,-1)
        else:
            batch_size = x.shape[0]
        if y.dim()==1:
            y = y.reshape(1,-1)
            
        # ENCODER
        h = torch.zeros(nhidden, batch_size, device=device, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
        for t in range(x.shape[1]):
            embedding_step_t = self.embx(x[:,t])
            embedding_step_t = x_dropout(embedding_step_t)
#             print(embedding_step_t.shape, embedding_step_t)
            h = self.encoder(h, embedding_step_t)
            h = encoder_h_dropout(h)
        c = h

        # DECODER
        output = []
        loss = 0.0
        correct = 0
        h = torch.zeros(nhidden, batch_size, device=device, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
        for t in range(y.shape[1]-1): # don't predict next char at final '>'
            embedding_step_t = self.emby(y[:,t])
            embedding_step_t = y_dropout(embedding_step_t)
            h = self.decoder(h, c, embedding_step_t)
            h = decoder_h_dropout(h)
            o = self.lin(h)
#             print(embedding_step_t.shape, o.shape, torch.tensor([y[t+1]], device=device).shape)
            o = z_dropout(o)
            # From y we want to predict y[1:]. at y[t], predict y[t+1] using c as context vector
            y_true = torch.tensor(y[:,t+1], device=device).reshape(batch_size)
            loss += F.cross_entropy(o, y_true, reduction="sum")
            p = F.softmax(o, dim=1)
            y_pred = torch.argmax(p, dim=1) # y_pred has prediction for each record in batch
            correct += torch.sum(y_pred==y[:,t+1])
            output.append(y_pred.reshape(-1,1))
        output = torch.cat(output, dim=1) # should be batch_size by (columns(y)-1)
        return output, loss, int(correct)
    
    def predict(self, x, y=None):
        "if y not none, compute loss, accuracy"
        with torch.no_grad():
            if isinstance(x, list):
                x = torch.tensor(x, device=device)

            assert x.dim()==1 or x.dim()==2 

            if x.dim()==1:
                batch_size = 1
                x = x.reshape(1,-1)
            else:
                batch_size = x.shape[0]

            # ENCODER
            h = torch.zeros(nhidden, batch_size, device=device, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
            for t in range(x.shape[1]):
                embedding_step_t = self.embx(x[:,t])
                h = self.encoder(h, embedding_step_t)
            c = h

            # DECODER
            loss = 0.0
            correct = 0
            output = []
            # y_pred is column vector starting with '<' for each record in the batch
            y_pred = ctoi['<']
            y_pred = torch.full(size=(batch_size,1), fill_value=y_pred, device=device, dtype=torch.long) # begin with "start of sequence" char
            output.append(y_pred)
            h = torch.zeros(nhidden, batch_size, device=device, dtype=torch.float64, requires_grad=False)  # reset hidden state at start of record
            while len(output)<MAX_LENGTH+2: # max plus last '>' char
                embedding_step_t = self.emby(y_pred.flatten())  # make it a list of symbols to use in embedding
                h = self.decoder(h, c, embedding_step_t)
                o = self.lin(h)
                p = F.softmax(o, dim=1)
                y_pred = torch.argmax(p, dim=1).reshape(-1,1)
                output.append(y_pred)
            output = torch.cat(output, dim=1) # should be batch_size by (columns(y)-1)
        return output  
    
    def score(self, X_test, Y_test):
        "Return raw accuracy of perfect translations to total records"
        with torch.no_grad():
            y_pred = trans.predict(X_test)
            correct = 0
            for i in range(len(X_test)):
                correct += tostr(y_pred[i])==tostr(Y_test[i])
    #     y_pred_real_char = torch.sum(y_pred>1)
    #     y_real_char = torch.sum(Y_test>1)
    #     print(torch.sum(Y_test>1))
    #     print(y_pred)
    #     print(Y_test)
    #     print(y_pred==Y_test)
        return correct/float(len(X_test))

In [42]:
trans = Transducer(input_sz=len(ctoi),
                   output_sz=len(ctoi),
                   input_embed_sz=char_embed_sz,
                   output_embed_sz=char_embed_sz,
                   nhidden=nhidden,
                   dropout=0.0,
                   useGRU=True)

optimizer = torch.optim.Adam(trans.parameters(), lr=0.0005, weight_decay=0)
scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, 
                                              mode='triangular2',
                                              step_size_up=3,
                                              base_lr=0.00005, max_lr=0.008,
                                              cycle_momentum=False)

history = []
epochs = 10
for epoch in range(1, epochs+1):
    epoch_training_loss = 0.0
    epoch_training_accum_accur = 0.0
    total_compares = 0
    for p in range(0, len(X_train), batch_size):  # do one epoch
        batch_X = X_train[p:p+batch_size]
        batch_Y = Y_train[p:p+batch_size]
        y_pred, loss, correct = trans(batch_X, batch_Y)
        
#         print([tostr(y_) for y_ in y_pred])
#         if epoch==10:
#             print(f"{tostr(x)}->{tostr(y)}: {tostr(y_pred)}, {correct} correct")
        epoch_training_accum_accur += correct
        epoch_training_loss += loss.detach().item()
        total_compares += batch_size * (MAX_LENGTH + 1) # For each "<foo>" predict and count "foo>" but MAX_LENGTH doesn't include <...>

        optimizer.zero_grad()
        loss.backward() # autograd computes U.grad, M.grad, ...
        optimizer.step()

    epoch_training_accur = trans.score(X_train, Y_train)
    epoch_test_accur = trans.score(X_test, Y_test)

    epoch_training_accum_accur /= total_compares
    epoch_training_loss /= total_compares
    
    print(f"Epoch {epoch:3d} training loss {epoch_training_loss:8.3f} char accur {epoch_training_accum_accur:.4f} phrase accur {epoch_training_accur:.4f}    test accur {epoch_test_accur:.3f}   LR {scheduler.get_last_lr()[0]:7.6f}")
    scheduler.step()



Epoch   1 training loss   17.295   char accur 0.1752  phrase accur  0.0000   test accur 0.000   LR 0.000050
Epoch   2 training loss    4.165   char accur 0.6220  phrase accur  0.0028   test accur 0.000   LR 0.002700
Epoch   3 training loss    1.649   char accur 0.7203  phrase accur  0.0198   test accur 0.000   LR 0.005350
Epoch   4 training loss    1.200   char accur 0.7716  phrase accur  0.0354   test accur 0.011   LR 0.008000
Epoch   5 training loss    0.730   char accur 0.8338  phrase accur  0.1768   test accur 0.034   LR 0.005350
Epoch   6 training loss    0.294   char accur 0.9040  phrase accur  0.5884   test accur 0.119   LR 0.002700
Epoch   7 training loss    0.093   char accur 0.9546  phrase accur  0.6181   test accur 0.102   LR 0.000050
Epoch   8 training loss    0.072   char accur 0.9609  phrase accur  0.8685   test accur 0.164   LR 0.001375
Epoch   9 training loss    0.044   char accur 0.9705  phrase accur  0.9180   test accur 0.175   LR 0.002700
Epoch  10 training loss    0

In [35]:
# TEST SINGLE RECORD
print(tostr(X_test[2]), ":", tostr(Y_test[2]))
y_pred = trans.predict(X_test[2], Y_test[2])
tostr(y_pred[0])

<<<<vous conduisez : <you drive>


'<you drive>'

In [36]:
# TEST ALL TEST RECORDS
y_pred = trans.predict(X_test, Y_test)
total_correct = 0
for i,y_ in enumerate(y_pred[0:10]):
    total_correct += tostr(Y_test[i])==tostr(y_)
    print(tostr(y_), "==", tostr(Y_test[i]))
print(total_correct)

<cuff him> == <follow him>
<i am fad> == <i had fun>
<you drive> == <you drive>
<stay down> == <stay down>
<we seated> == <after you>
<i'm him> == <i'm buying>
<i'm fat> == <i'm a hero>
<it worked> == <i phoned>
<i felmo> == <i want you>
<we won> == <it snowed>
2


In [37]:
trans.score(X_train, Y_train)

0.8189533239038189

In [38]:
trans.score(X_test, Y_test)

0.1807909604519774

In [39]:
def check(X,Y,verbose=(0,1,2)):
    "Use Levenshtein to measure how close output predictions are to truth."
    with torch.no_grad():
        total_compares = 0
        total_correct = 0
        total_d = 0
        for i in range(len(X)):
            x = X[i]
            y = Y[i]
            y_pred = trans.predict(x)
            y_pred = y_pred[0] # only one record for now
            total_compares += len(y) - 1 # From "<foo>" predict "foo>" but don't count last '>' for metrics
            total_correct += tostr(y)==tostr(y_pred)
            d = editdistance.eval(tostr(y),tostr(y_pred))
            total_d += d
            if verbose>0:
                if verbose>1 or d>0:
                    print(f"{tostr(x):20s} : {tostr(y)}")
                    print(f"{'':20s} : {tostr(y_pred):20s} Levenshtein {d} out of {len(y)}")
    return total_d/float(len(X)), total_correct/len(X)

In [40]:
avg_d, accur = check(X_train, Y_train, verbose=0)
print(f"Training n={len(X_train)} average Levenshtein score {avg_d:8.2f}, perfect accuracy {accur:8.2f}")

Training n=707 average Levenshtein score     0.85, perfect accuracy     0.82


In [41]:
avg_d, accur = check(X_test, Y_test, verbose=0)
print(f"Testing n={len(X_test)} average Levenshtein score {avg_d:8.2f}, perfect accuracy {accur:8.2f}")

Testing n=177 average Levenshtein score     4.85, perfect accuracy     0.18
