## Grapheme-to-Phoneme task

In [1]:
import re
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import pandas as pd 
import tensorflow as tf
import tensorflow_addons as tfa
import time

In [2]:
with  open('/Users/chess1812/Documents/GitHub/EPAM_Tasks/Task4(grapheme-to-phoneme)/cmudict.txt') as file:
    lines = []
    for line in file:
        lines.append(line.split())

print(' amount of data', len(lines))

 amount of data 135154


As usual  let's  see what we actually have. 

In [3]:
print("some examples:")
for example in lines[0:5]:
    print(example)

some examples:
["'bout", 'B', 'AW1', 'T']
["'cause", 'K', 'AH0', 'Z']
["'course", 'K', 'AO1', 'R', 'S']
["'cuse", 'K', 'Y', 'UW1', 'Z']
["'em", 'AH0', 'M']


So, our task is to translate grapheme to phoneme. Sounds interesting.

First of all we have to divide our date and preprocess, nothing new

In [4]:
graphemes = [ line[0] for line in lines]
phonemes = [ line[1::] for line in lines]

In [5]:
from sklearn.model_selection import train_test_split
grapheme_train, grapheme_test, phoneme_train, phoneme_test = train_test_split(graphemes, phonemes, test_size=0.1, random_state=42)

In [6]:
texts_train = [list(grapheme) for grapheme in grapheme_train]
texts_test = [list(grapheme) for grapheme in grapheme_test]

In [7]:
print('mean phoneme len = ', sum([len(phoneme) for phoneme in phoneme_train] ) / len(phoneme_train))
print('max phoneme len = ', max([len(phoneme) for phoneme in phoneme_train] ) )
print('min phoneme len = ', min([len(phoneme) for phoneme in phoneme_train] ) )

print('mean word len = ', sum([len(word) for word in texts_train] ) / len(texts_train))
print('max word len = ', max([len(word) for word in texts_train] ) )
print('min word len = ', min([len(word) for word in texts_train] ) )

mean phoneme len =  6.385767605518012
max phoneme len =  28
min phoneme len =  1
mean word len =  7.731202420296289
max word len =  28
min word len =  1


phoneme distribution:

In [8]:
from collections import defaultdict
dct = defaultdict(int)
for phoneme in phoneme_train:
    for ph in phoneme:
        dct[ph]+=1

all_cnt = sum(len(phoneme) for phoneme in phoneme_train)
value_cnt = []
for phoneme_type, cnt in dct.items():
    value_cnt.append((phoneme_type, cnt / all_cnt))
    
sorted(value_cnt, key = lambda x: -x[1])

[('AH0', 0.07314432405709931),
 ('N', 0.07102138134179249),
 ('S', 0.058496148062702126),
 ('L', 0.057889776917214246),
 ('T', 0.0567941891363009),
 ('R', 0.053790656477228256),
 ('K', 0.04998377860629905),
 ('D', 0.037784260613426164),
 ('IH0', 0.0348682719838507),
 ('M', 0.034366181226440355),
 ('Z', 0.03285862154201084),
 ('ER0', 0.027862174799678664),
 ('IY0', 0.025793303396708343),
 ('B', 0.02491915051393495),
 ('EH1', 0.024132541660658743),
 ('P', 0.023206892289945824),
 ('AA1', 0.019596988485385297),
 ('AE1', 0.019545491997445773),
 ('IH1', 0.01829927698930933),
 ('F', 0.01621238181556018),
 ('G', 0.01583388262920469),
 ('V', 0.012525233279090366),
 ('IY1', 0.011984520155725379),
 ('NG', 0.011537788122850022),
 ('EY1', 0.010926267328568192),
 ('HH', 0.010849022596658908),
 ('W', 0.010393278678394133),
 ('SH', 0.010211753558407317),
 ('OW1', 0.010080437514161534),
 ('OW0', 0.009582208993346653),
 ('AO1', 0.0095126887346283),
 ('AY1', 0.008034739530764001),
 ('AH1', 0.007921447257

Add "start" and "end" symbol for feature algorithms. 

In [9]:
def tagger(decoder_input):
    bos = "<BOS> "
    eos = " <EOS>"
    final_target = decoder_input.copy()
    final_target.append(eos)
    final_target.insert(0, bos)
    return final_target

In [10]:
texts_train = [tagger(word) for word in texts_train]
texts_test = [tagger(word) for word in texts_test]

In [11]:
phoneme_train = [tagger(phoneme) for phoneme in phoneme_train]
phoneme_test = [tagger(phoneme) for phoneme in phoneme_test]

Example:

In [12]:
texts_train[0]

['<BOS> ', 's', 'e', 't', 't', 'l', 'e', 'm', 'e', 'n', 't', ' <EOS>']

As is well known, computer, in particular keras framework  works with numbers. So, we need to transform our date to indices.

In [13]:
from keras.preprocessing.text import Tokenizer

phoneme_tokenizer = Tokenizer()             
phoneme_tokenizer.fit_on_texts(phoneme_train)         

y_train = phoneme_tokenizer.texts_to_sequences(phoneme_train)
y_test = phoneme_tokenizer.texts_to_sequences(phoneme_test)

In [14]:
# look at first encoded data point
print("initial  data  example: \n", phoneme_train[0])
print(" Encoded data  example: \n", y_train[0])

initial  data  example: 
 ['<BOS> ', 'S', 'EH1', 'T', 'AH0', 'L', 'M', 'AH0', 'N', 'T', ' <EOS>']
 Encoded data  example: 
 [1, 5, 17, 7, 3, 6, 12, 3, 4, 7, 2]


In [15]:
letter_tokenizer = Tokenizer()             
letter_tokenizer.fit_on_texts(texts_train)         

X_train = letter_tokenizer.texts_to_sequences(texts_train)
X_test = letter_tokenizer.texts_to_sequences(texts_test)

# look at first encoded data point
print("initial  data  example: \n", texts_train[0])
print(" Encoded data  example: \n", X_train[0])

initial  data  example: 
 ['<BOS> ', 's', 'e', 't', 't', 'l', 'e', 'm', 'e', 'n', 't', ' <EOS>']
 Encoded data  example: 
 [1, 7, 3, 10, 10, 11, 3, 14, 3, 8, 10, 2]


In [16]:
BUFFER_SIZE = 32000
BATCH_SIZE = 64
NUM_EXAMPLES = 30000

In [17]:
VOCAB_INPUT_SIZE = len(letter_tokenizer.word_counts) + 1
VOCAB_OUTPUT_SIZE = len(phoneme_tokenizer.word_counts) + 1
VOCAB_SIZE = max(len(letter_tokenizer.word_counts), len(phoneme_tokenizer.word_counts)) + 1
print('number of different characters:', VOCAB_INPUT_SIZE)
print('number of different phonems:', VOCAB_OUTPUT_SIZE)

number of different characters: 37
number of different phonems: 87


In [18]:
INPUT_MAX_LEN = max(len(x) for x in X_train)
OUTPUT_MAX_LEN = max(len(y) for y in y_train)
MAX_LEN =  max(INPUT_MAX_LEN, OUTPUT_MAX_LEN)
print('input max len:', INPUT_MAX_LEN )
print('output max len:', OUTPUT_MAX_LEN)


input max len: 30
output max len: 30


In [19]:
EMBEDDING_DIM = 256
UNITS = 1024
STEPS_PER_EPOCH = NUM_EXAMPLES //BATCH_SIZE

Pad each word to a maximum length. Why? Because we need to fix the maximum length for the inputs to recurrent encoders.

In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train_pad = pad_sequences(X_train, maxlen=INPUT_MAX_LEN, padding="post", truncating="post")
X_test_pad = pad_sequences(X_test, maxlen=INPUT_MAX_LEN, padding="post", truncating="post")

In [21]:
y_train_pad = pad_sequences(y_train, maxlen=OUTPUT_MAX_LEN, padding="post", truncating="post")
y_test_pad = pad_sequences(y_test, maxlen=OUTPUT_MAX_LEN, padding="post", truncating="post")

### Baseline

Before using some neural nets, let's start with something simple. For our point of view KNN classifier may be a good baseline in this case. Firstly model finds some nearest neighbours and then use their translation with some coefficients to make a decision.  

In [72]:
from sklearn.base import BaseEstimator, ClassifierMixin
import difflib
from collections import defaultdict

class KNNClassifier(BaseEstimator, ClassifierMixin):
    
    def __init__(self, k = 5):
        self.words = {}
        self.k = k
        
    def fit(self, X, y):
        for x, label in zip(X,y):
            self.words[x] = label
    
        return self
    
    
    def predict(self, X):
        y = [None] * len(X)
        for idx, x in enumerate(X):
            if x in self.words:
                y[idx] = self.words[x]
                continue
            closest_words = difflib.get_close_matches(x, list(self.words.keys()), n = self.k)
            distances = [self.distance(word, x) for word in closest_words]
            coeffs = [1 / dist  for dist in distances]
            y[idx] = self.voting(coeffs, np.array([self.words[word] for word in closest_words]).T )
        return y
    
    def predict_proba(self, X):
        pass
    
    def voting(self, coeffs, candidates):
        result = []
        for place in candidates:
            dct = defaultdict(int)
            for coeff,candidate in  zip(coeffs, place):
                dct[candidate] += coeff
            result.append( max([(coeff, candidate) for candidate, coeff in dct.items()])[1])
        return result
    
    def distance(self, a, b):
        "Calculates the Levenshtein distance between a and b."
        n, m = len(a), len(b)
        if n > m:
            # Make sure n <= m, to use O(min(n, m)) space
            a, b = b, a
            n, m = m, n

        current_row = range(n + 1)  # Keep current and previous row, not entire matrix
        for i in range(1, m + 1):
            previous_row, current_row = current_row, [i] + [0] * n
            for j in range(1, n + 1):
                add, delete, change = previous_row[j] + 1, current_row[j - 1] + 1, previous_row[j - 1]
                if a[j - 1] != b[i - 1]:
                    change += 1
                current_row[j] = min(add, delete, change)

        return current_row[n]

Okay, now it's time to train and test....

In [73]:
knn = KNNClassifier(k = 5)

In [74]:
knn.fit(grapheme_train, y_train_pad)

KNNClassifier()

In [75]:
%%time
y_pred = knn.predict(grapheme_test[:1000])

CPU times: user 7min 9s, sys: 202 ms, total: 7min 9s
Wall time: 7min 9s


In [76]:
y_test_pad[5], np.array(y_pred[5])

(array([ 1,  5, 21,  8, 15,  3,  6, 13,  2,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32),
 array([ 1,  5, 21,  8, 15,  3,  6,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32))

Hm, somehow, it works. But how well ?

In order to compare our models we will evaluate two metrics: first metric consider only exact match as right prediction. And second one just evaluate all phoneme matches as right prediction. 

In [77]:
def accuracy(y_real, y_pred, return_percent = False):
    right_answer_cnt = 0
    for real, pred in zip(y_real, y_pred):
        if list(real) == list(pred):
            right_answer_cnt+=1
    return right_answer_cnt / len(y_real) if return_percent else right_answer_cnt

In [78]:
def equal_count(y_real, y_pred, return_percent = False):
    right_answer_cnt = 0
    all_count = 0
    for real, pred in zip(y_real, y_pred):
         for real_el, pred_el in zip(real, pred):               
            right_answer_cnt += (real_el == pred_el)
         all_count += len(real)
    return (right_answer_cnt / all_count) if return_percent else right_answer_cnt, all_count

In [79]:
def lev_distance(a, b):
    "Calculates the Levenshtein distance between a and b."
    n, m = len(a), len(b)
    if n > m:
        # Make sure n <= m, to use O(min(n, m)) space
        a, b = b, a
        n, m = m, n

    current_row = range(n + 1)  # Keep current and previous row, not entire matrix
    for i in range(1, m + 1):
        previous_row, current_row = current_row, [i] + [0] * n
        for j in range(1, n + 1):
            add, delete, change = previous_row[j] + 1, current_row[j - 1] + 1, previous_row[j - 1]
            if a[j - 1] != b[i - 1]:
                change += 1
            current_row[j] = min(add, delete, change)

    return current_row[n]

In [80]:
def lev_metric(y_real, y_pred, return_mean_distance = False):
    distance = np.array([lev_distance(real, pred) for real, pred in zip(y_real, y_pred)])
    return np.mean(distance) if return_mean_distance else distance
    

In [81]:
accuracy(y_test_pad[:1000], y_pred, return_percent = True)

0.139

In [82]:
equal_count(y_test_pad[:1000], y_pred, return_percent = True)[0]

0.8969666666666667

In [83]:
lev_metric(y_test_pad[:1000], y_pred, return_mean_distance = True)

2.578

Not so good. But wait, in example above we may see that our model could predict wrong ending. And at the same time  impact on our second metric a lot.  But ending  at some point is our fiction. So, probably we shouldn’t  consider that. 

In [31]:
def trim_result(inputs, beg_idx, end_idx):
    start = 1 if inputs[0] == beg_idx  else  0
    end = list(inputs).index(end_idx) if end_idx in list(inputs) else list(inputs).index(0)
    return inputs[start: end]

In [32]:
start_token = phoneme_tokenizer.word_index['<bos> ']
end_token = phoneme_tokenizer.word_index[' <eos>']

In [33]:
y_pred = [trim_result(y, start_token, end_token) for y in y_pred]
y_real = [trim_result(y, start_token, end_token) for y in y_test_pad]
accuracy(y_real[:1000], np.array(y_pred), return_percent = True)

  This is separate from the ipykernel package so we can avoid doing imports until


0.165

In [34]:
equal_count(y_real[:1000], y_pred, return_percent = True)[0]

0.6447876447876448

Hm, now the result is better (by first metric). It's still  far away from completed. But as a baseline it is okay. 

### Seq2Seq

Seq2Seq architecture is a common  decision  for such problems. Let's implement that! 

In [35]:
train_dataset = tf.data.Dataset.from_tensor_slices((tf.convert_to_tensor(X_train_pad), tf.convert_to_tensor(y_train_pad)))
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

test_dataset = tf.data.Dataset.from_tensor_slices(( tf.convert_to_tensor(X_test_pad), tf.convert_to_tensor(y_test_pad)))
test_dataset = test_dataset.batch(BATCH_SIZE, drop_remainder=True)

In [36]:
example_input_batch, example_target_batch = next(iter(train_dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 30]), TensorShape([64, 30]))

In [37]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        ##________ LSTM layer in Encoder ------- ##
        self.lstm_layer = tf.keras.layers.LSTM(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')



    def call(self, x, hidden):
        x = self.embedding(x)
        output, h, c = self.lstm_layer(x, initial_state = hidden)
        return output, h, c

    def initialize_hidden_state(self):
        return [tf.zeros((self.batch_sz, self.enc_units)), tf.zeros((self.batch_sz, self.enc_units))]

In [38]:
## Test Encoder Stack
encoder = Encoder(VOCAB_INPUT_SIZE, EMBEDDING_DIM, UNITS, BATCH_SIZE)


# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_h, sample_c = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder h vecotr shape: (batch size, units) {}'.format(sample_h.shape))
print ('Encoder c vector shape: (batch size, units) {}'.format(sample_c.shape))

Encoder output shape: (batch size, sequence length, units) (64, 30, 1024)
Encoder h vecotr shape: (batch size, units) (64, 1024)
Encoder c vector shape: (batch size, units) (64, 1024)


In [39]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, attention_type='luong'):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.attention_type = attention_type

        # Embedding Layer
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        #Final Dense layer on which softmax will be applied
        self.fc = tf.keras.layers.Dense(vocab_size)

        # Define the fundamental cell for decoder recurrent structure
        self.decoder_rnn_cell = tf.keras.layers.LSTMCell(self.dec_units)



        # Sampler
        self.sampler = tfa.seq2seq.sampler.TrainingSampler()

        # Create attention mechanism with memory = None
        self.attention_mechanism = self.build_attention_mechanism(self.dec_units, 
                                                                  None, self.batch_sz*[INPUT_MAX_LEN], self.attention_type)

        # Wrap attention mechanism with the fundamental rnn cell of decoder
        self.rnn_cell = self.build_rnn_cell(batch_sz)

        # Define the decoder with respect to fundamental rnn cell
        self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler=self.sampler, output_layer=self.fc)


    def build_rnn_cell(self, batch_sz):
        rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnn_cell, 
                                      self.attention_mechanism, attention_layer_size=self.dec_units)
        return rnn_cell

    def build_attention_mechanism(self, dec_units, memory, memory_sequence_length, attention_type='luong'):
        # ------------- #
        # typ: Which sort of attention (Bahdanau, Luong)
        # dec_units: final dimension of attention outputs 
        # memory: encoder hidden states of shape (batch_size, INPUT_MAX_LEN, enc_units)
        # memory_sequence_length: 1d array of shape (batch_size) with every element set to INPUT_MAX_LEN (for masking purpose)

        if(attention_type=='bahdanau'):
            return tfa.seq2seq.BahdanauAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)
        else:
            return tfa.seq2seq.LuongAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)

    def build_initial_state(self, batch_sz, encoder_state, Dtype):
        decoder_initial_state = self.rnn_cell.get_initial_state(batch_size=batch_sz, dtype=Dtype)
        decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
        return decoder_initial_state


    def call(self, inputs, initial_state):
        x = self.embedding(inputs)
        outputs, _, _ = self.decoder(x, initial_state=initial_state, sequence_length=self.batch_sz*[OUTPUT_MAX_LEN-1])
        return outputs

In [40]:
# Test decoder stack

decoder = Decoder(VOCAB_OUTPUT_SIZE, EMBEDDING_DIM, UNITS, BATCH_SIZE, 'luong')
sample_x = tf.random.uniform((BATCH_SIZE, OUTPUT_MAX_LEN))
decoder.attention_mechanism.setup_memory(sample_output)
initial_state = decoder.build_initial_state(BATCH_SIZE, [sample_h, sample_c], tf.float32)


sample_decoder_outputs = decoder(sample_x, initial_state)

print("Decoder Outputs Shape: ", sample_decoder_outputs.rnn_output.shape)

Decoder Outputs Shape:  (64, 29, 87)


defind optimisation process

In [41]:
optimizer = tf.keras.optimizers.Adam()

def loss_function(real, pred):
    # real shape = (BATCH_SIZE, max_length_output)
    # pred shape = (BATCH_SIZE, max_length_output, tar_vocab_size )
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    loss = cross_entropy(y_true=real, y_pred=pred)
    mask = tf.logical_not(tf.math.equal(real,0))   #output 0 for y=0 else output 1
    mask = tf.cast(mask, dtype=loss.dtype)  
    loss = mask* loss
    loss = tf.reduce_mean(loss)
    return loss

In [42]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

Training:

In [43]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_h, enc_c = encoder(inp, enc_hidden)


        dec_input = targ[ : , :-1 ] # Ignore <end> token
        real = targ[ : , 1: ]         # ignore <start> token

        # Set the AttentionMechanism object with encoder_outputs
        decoder.attention_mechanism.setup_memory(enc_output)

        # Create AttentionWrapperState as initial_state for decoder
        decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, [enc_h, enc_c], tf.float32)
        pred = decoder(dec_input, decoder_initial_state)
        logits = pred.rnn_output
        loss = loss_function(real, logits)

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return loss

In [44]:
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    # print(enc_hidden[0].shape, enc_hidden[1].shape)

    for (batch, (inp, targ)) in enumerate(train_dataset.take(STEPS_PER_EPOCH)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

    if batch % 100 == 0:
        print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / STEPS_PER_EPOCH))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Loss 0.4716
Time taken for 1 epoch 950.9030847549438 sec

Epoch 2 Loss 0.1549
Time taken for 1 epoch 1007.474799156189 sec

Epoch 3 Loss 0.1264
Time taken for 1 epoch 1023.8913309574127 sec

Epoch 4 Loss 0.1093
Time taken for 1 epoch 30184.14626097679 sec

Epoch 5 Loss 0.1001
Time taken for 1 epoch 1272.8709712028503 sec

Epoch 6 Loss 0.0940
Time taken for 1 epoch 1303.9953260421753 sec

Epoch 7 Loss 0.0861
Time taken for 1 epoch 1308.0938849449158 sec

Epoch 8 Loss 0.0830
Time taken for 1 epoch 1260.2351610660553 sec

Epoch 9 Loss 0.0782
Time taken for 1 epoch 1107.125169992447 sec

Epoch 10 Loss 0.0762
Time taken for 1 epoch 1288.6313879489899 sec



In [45]:
def translate_step(inputs, preprocess = False):    
    if preprocess:
        pass
        
        
    inputs = tf.convert_to_tensor(inputs)
    inference_batch_size = inputs.shape[0]
    result = ''

    enc_start_state = [tf.zeros((inference_batch_size, UNITS)), tf.zeros((inference_batch_size,UNITS))]
    enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)

    dec_h = enc_h
    dec_c = enc_c
    
    start_token = phoneme_tokenizer.word_index['<bos> ']
    start_tokens = tf.fill([inference_batch_size], start_token)
    end_token = phoneme_tokenizer.word_index[' <eos>']

    greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()

    # Instantiate BasicDecoder object
    decoder_instance = tfa.seq2seq.BasicDecoder(cell=decoder.rnn_cell, sampler=greedy_sampler, output_layer=decoder.fc)
    # Setup Memory in decoder stack
    decoder.attention_mechanism.setup_memory(enc_out)

    # set decoder_initial_state
    decoder_initial_state = decoder.build_initial_state(inference_batch_size, [enc_h, enc_c], tf.float32)


    ### Since the BasicDecoder wraps around Decoder's rnn cell only, you have to ensure that the inputs to BasicDecoder 
    ### decoding step is output of embedding layer. tfa.seq2seq.GreedyEmbeddingSampler() takes care of this. 
    ### You only need to get the weights of embedding layer, which can be done by decoder.embedding.variables[0] and pass this callabble to BasicDecoder's call() function

    decoder_embedding_matrix = decoder.embedding.variables[0]

    outputs, _, _ = decoder_instance(decoder_embedding_matrix, start_tokens = start_tokens, end_token= end_token, initial_state=decoder_initial_state)
    return outputs.sample_id.numpy()
    #return [trim_result(output, end_token) for output in outputs.sample_id.numpy()]

In [46]:
print('predict example:', translate_step(example_input_batch)[2])
print('real: ',  example_target_batch[2].numpy())

predict example: [ 8 20 12  6 54 29 15  2  2  2  2  2  2  2  2  2  2  2]
real:  [ 1  8 45 12  6 52 15  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0]


So, let's transform that into readable format. 

In [47]:
def translate(inputs, translate_algo = translate_step, preprocess = False ):
    result = [trim_result(res, start_token, end_token ) for res in translate_algo(inputs, preprocess = preprocess) ]
    return phoneme_tokenizer.sequences_to_texts(result)

In [48]:
predicted = translate(example_input_batch)
real_indices = [trim_result(target, start_token, end_token) for target in example_target_batch.numpy()]
real = phoneme_tokenizer.sequences_to_texts(real_indices )

print('predict example:', predicted[2])
print('real: ',  real[2])

predict example: r ae1 m l ao2 w iy0
real:  r ae2 m l aw1 iy0


That's better. 

Now it's time to evaluate score. As in baseline example we will consider only  full convergence as right prediction.

In [49]:
from tqdm import tqdm 

In [50]:
right_cnt = 0

for batch, (inp, targ) in tqdm(enumerate(test_dataset.take(STEPS_PER_EPOCH))):
    y_pred = translate(inp)
    real_indices = [trim_result(target, start_token, end_token) for target in targ.numpy()]
    y_real = phoneme_tokenizer.sequences_to_texts(real_indices )
    right_cnt += accuracy(y_real, y_pred)

print('Seq2Seq with attention test accuracy:' ,right_cnt / len(y_test))

211it [02:20,  1.50it/s]

Seq2Seq with attention test accuracy: 0.4756584788398935





But what if we consider accuracy just as number of right predicted phonemes divided by total number of phonemes 

In [51]:
right_cnt = 0
all_cnt = 0
for batch, (inp, targ) in tqdm(enumerate(test_dataset.take(STEPS_PER_EPOCH))):
    y_pred = translate(inp)
    real_indices = [trim_result(target, start_token, end_token) for target in targ.numpy()]
    y_real = phoneme_tokenizer.sequences_to_texts(real_indices )
    right, size = equal_count(y_real, y_pred)
    right_cnt += right
    all_cnt += size
    
print('Seq2Seq with attention test soft accuracy:' ,right_cnt / all_cnt)

211it [02:21,  1.49it/s]

Seq2Seq with attention test soft accuracy: 0.8446156054154906





In [70]:
all_res = []
for batch, (inp, targ) in tqdm(enumerate(test_dataset.take(STEPS_PER_EPOCH))):
    y_pred = translate(inp)
    real_indices = [trim_result(target, start_token, end_token) for target in targ.numpy()]
    y_real = phoneme_tokenizer.sequences_to_texts(real_indices )
    all_res.append(lev_metric(y_real, y_pred))


print('Seq2Seq with attention test levenstain mean distance:' ,
      sum([sum(dist_lst) for dist_lst in all_res ]) / len(y_test))

211it [01:28,  2.39it/s]

Seq2Seq with attention test levenstain mean distance: 1.749408108907961





It's much better than our baseline. But for sure, there are many ways to improve. For instance we may use beam search instead of greedy algorithm. 

In [52]:
def beam_step(inputs, beam_width=3, preprocess = False):
    
    if preprocess:
        pass

    inputs = tf.convert_to_tensor(inputs)
    inference_batch_size = inputs.shape[0]
    result = ''

    enc_start_state = [tf.zeros((inference_batch_size, UNITS)), tf.zeros((inference_batch_size,UNITS))]
    enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)

    dec_h = enc_h
    dec_c = enc_c

    start_tokens = tf.fill([inference_batch_size], phoneme_tokenizer.word_index['<bos> '])
    end_token = phoneme_tokenizer.word_index[' <eos>']

    # From official documentation
    # NOTE If you are using the BeamSearchDecoder with a cell wrapped in AttentionWrapper, then you must ensure that:
    # The encoder output has been tiled to beam_width via tfa.seq2seq.tile_batch (NOT tf.tile).
    # The batch_size argument passed to the get_initial_state method of this wrapper is equal to true_batch_size * beam_width.
    # The initial state created with get_initial_state above contains a cell_state value containing properly tiled final state from the encoder.

    enc_out = tfa.seq2seq.tile_batch(enc_out, multiplier=beam_width)
    decoder.attention_mechanism.setup_memory(enc_out)

    # set decoder_inital_state which is an AttentionWrapperState considering beam_width
    hidden_state = tfa.seq2seq.tile_batch([enc_h, enc_c], multiplier=beam_width)
    decoder_initial_state = decoder.rnn_cell.get_initial_state(batch_size=beam_width*inference_batch_size, dtype=tf.float32)
    decoder_initial_state = decoder_initial_state.clone(cell_state=hidden_state)

    # Instantiate BeamSearchDecoder
    decoder_instance = tfa.seq2seq.BeamSearchDecoder(decoder.rnn_cell,beam_width=beam_width, output_layer=decoder.fc)
    decoder_embedding_matrix = decoder.embedding.variables[0]

    # The BeamSearchDecoder object's call() function takes care of everything.
    outputs, final_state, sequence_lengths = decoder_instance(decoder_embedding_matrix, start_tokens=start_tokens, end_token=end_token, initial_state=decoder_initial_state)
    # outputs is tfa.seq2seq.FinalBeamSearchDecoderOutput object. 
    # The final beam predictions are stored in outputs.predicted_id
    # outputs.beam_search_decoder_output is a tfa.seq2seq.BeamSearchDecoderOutput object which keep tracks of beam_scores and parent_ids while performing a beam decoding step
    # final_state = tfa.seq2seq.BeamSearchDecoderState object.
    # Sequence Length = [inference_batch_size, beam_width] details the maximum length of the beams that are generated


    # outputs.predicted_id.shape = (inference_batch_size, time_step_outputs, beam_width)
    # outputs.beam_search_decoder_output.scores.shape = (inference_batch_size, time_step_outputs, beam_width)
    # Convert the shape of outputs and beam_scores to (inference_batch_size, beam_width, time_step_outputs)
    final_outputs = tf.transpose(outputs.predicted_ids, perm=(0,2,1))[:,0] # taking only best version
  #  beam_scores = tf.transpose(outputs.beam_search_decoder_output.scores, perm=(0,2,1))

    return final_outputs.numpy() #, beam_scores.numpy()

In [53]:
predicted = translate(example_input_batch, translate_algo =  beam_step)
real_indices = [trim_result(target, start_token, end_token) for target in example_target_batch.numpy()]
real = phoneme_tokenizer.sequences_to_texts(real_indices )

print('predict example:', predicted[2])
print('real: ',  real[2])

predict example: r aa0 m l aa1 w iy0
real:  r ae2 m l aw1 iy0


In [54]:
right_cnt = 0

for batch, (inp, targ) in tqdm(enumerate(test_dataset.take(STEPS_PER_EPOCH))):
    y_pred = translate(inp, translate_algo =  beam_step)
    real_indices = [trim_result(target, start_token, end_token) for target in targ.numpy()]
    y_real = phoneme_tokenizer.sequences_to_texts(real_indices )
    right_cnt += accuracy(y_real, y_pred)

print('beam test accuracy: ', right_cnt / len(y_test))

211it [05:29,  1.56s/it]

beam test accuracy:  0.48490677715300384





In [None]:
right_cnt = 0

for batch, (inp, targ) in tqdm(enumerate(test_dataset.take(STEPS_PER_EPOCH))):
    y_pred = translate(inp, translate_algo =  beam_step)
    real_indices = [trim_result(target, start_token, end_token) for target in targ.numpy()]
    y_real = phoneme_tokenizer.sequences_to_texts(real_indices )
    right_cnt += accuracy(y_real, y_pred)

print('beam test accuracy: ', right_cnt / len(y_test))

Also let's evaluate soft accuracy like last time. 

In [55]:
right_cnt = 0
all_cnt = 0
for batch, (inp, targ) in tqdm(enumerate(test_dataset.take(STEPS_PER_EPOCH))):
    y_pred = translate(inp, translate_algo =  beam_step)
    real_indices = [trim_result(target, start_token, end_token) for target in targ.numpy()]
    y_real = phoneme_tokenizer.sequences_to_texts(real_indices )
    right, size = equal_count(y_real, y_pred)
    right_cnt += right
    all_cnt += size
    
print('beam test soft accuracy:' ,right_cnt / all_cnt)

211it [05:10,  1.47s/it]

beam test soft accuracy: 0.8490734257370626





In [71]:
all_res = []
for batch, (inp, targ) in tqdm(enumerate(test_dataset.take(STEPS_PER_EPOCH))):
    y_pred = translate(inp, translate_algo =  beam_step)
    real_indices = [trim_result(target, start_token, end_token) for target in targ.numpy()]
    y_real = phoneme_tokenizer.sequences_to_texts(real_indices )
    all_res.append(lev_metric(y_real, y_pred))


print('Seq2Seq with attention test levenstain mean distance:' ,
      sum([sum(dist_lst) for dist_lst in all_res ]) / len(y_test))

211it [03:13,  1.09it/s]

Seq2Seq with attention test levenstain mean distance: 1.6724622669428826





**Conclusions:** 

In this notebook we try to implement and use seq2seq arhitecture for solving grapheme to phoneme. Also there are a lot of things, that you could try and optimise such us attention mechanism, hyperparameters, search algorithm and so on. 