In [1]:
#install BLEU score implementation
!pip install sacrebleu 



You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [2]:
#import required libraries
import numpy as np
import re
import sacrebleu
import tensorflow as tf
import time
import unicodedata



In [19]:
# dataset for experiments
sentences = [
  ("Do you want a cup of coffee?", "¿Quieres una taza de café?"),
  ("I've had coffee already.", "Ya tomé café."),
  ("Can I get you a coffee?", "¿Quieres que te traiga un café?"),
  ("Please give me some coffee.", "Dame algo de café por favor."),
  ("Would you like me to make coffee?", "¿Quieres que prepare café?"),
  ("Two coffees, please.", "Dos cafés, por favor."),
  ("How about a cup of coffee?", "¿Qué tal una taza de café?"),
  ("I drank two cups of coffee.", "Me tomé dos tazas de café."),
  ("Would you like to have a cup of coffee?", "¿Te gustaría tomar una taza de café?"),
  ("There'll be coffee and cake at five.", "A las cinco habrá café y un pastel."),
  ("Another coffee, please.", "Otro café, por favor."),
  ("I made coffee.", "Hice café."),
  ("I would like to have a cup of coffee.", "Quiero beber una taza de café."),
  ("Do you want me to make coffee?", "¿Quieres que haga café?"),
  ("It is hard to wake up without a strong cup of coffee.", "Es difícil despertarse sin una taza de café fuerte."),
  ("All I drank was coffee.", "Todo lo que bebí fue café."),
  ("I've drunk way too much coffee today.", "He bebido demasiado café hoy."),
  ("Which do you prefer, tea or coffee?", "¿Qué prefieres, té o café?"),
  ("There are many kinds of coffee.", "Hay muchas variedades de café."),
  ("I will make some coffee.",	"Prepararé algo de café.")
]

In [20]:
sentences[0]

('Do you want a cup of coffee?', '¿Quieres una taza de café?')

In [21]:
# pre-processing function
def preprocess(s):
  # for details, see https://www.tensorflow.org/alpha/tutorials/sequences/nmt_with_attention
  s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
  s = re.sub(r"([?.!,¿])", r" \1 ", s)
  s = re.sub(r'[" "]+', " ", s)
  s = re.sub(r"[^a-zA-Z?.!,¿]+", " ", s)
  s = s.strip()
  s = '<start> ' + s + ' <end>'
  return s

In [22]:
# wrapping the sentences with tags
print("Original:", sentences[0])
tagged_sentences = [(preprocess(source), preprocess(target)) for (source, target) in sentences]
print("Preprocessed:", tagged_sentences[0])

Original: ('Do you want a cup of coffee?', '¿Quieres una taza de café?')
Preprocessed: ('<start> Do you want a cup of coffee ? <end>', '<start> ¿ Quieres una taza de cafe ? <end>')


In [23]:
# source target sentence pairs
source_sentences, target_sentences = list(zip(*tagged_sentences))
source_sentences, target_sentences

(('<start> Do you want a cup of coffee ? <end>',
  '<start> I ve had coffee already . <end>',
  '<start> Can I get you a coffee ? <end>',
  '<start> Please give me some coffee . <end>',
  '<start> Would you like me to make coffee ? <end>',
  '<start> Two coffees , please . <end>',
  '<start> How about a cup of coffee ? <end>',
  '<start> I drank two cups of coffee . <end>',
  '<start> Would you like to have a cup of coffee ? <end>',
  '<start> There ll be coffee and cake at five . <end>',
  '<start> Another coffee , please . <end>',
  '<start> I made coffee . <end>',
  '<start> I would like to have a cup of coffee . <end>',
  '<start> Do you want me to make coffee ? <end>',
  '<start> It is hard to wake up without a strong cup of coffee . <end>',
  '<start> All I drank was coffee . <end>',
  '<start> I ve drunk way too much coffee today . <end>',
  '<start> Which do you prefer , tea or coffee ? <end>',
  '<start> There are many kinds of coffee . <end>',
  '<start> I will make some coff

In [24]:
#tokenizing the source sentences
source_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters=" ")
source_tokenizer.fit_on_texts(source_sentences)
source_data=source_tokenizer.texts_to_sequences(source_sentences)
print("Sequence:", source_data[0])
source_data=tf.keras.preprocessing.sequence.pad_sequences(source_data, padding="post")
print("Sequence:", source_data[0])

Sequence: [1, 12, 8, 19, 9, 10, 6, 3, 7, 2]
Sequence: [ 1 12  8 19  9 10  6  3  7  2  0  0  0  0  0]


In [25]:
#tokenizing the target sentences
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters=" ")
target_tokenizer.fit_on_texts(target_sentences)
target_data = target_tokenizer.texts_to_sequences(target_sentences)
target_data = tf.keras.preprocessing.sequence.pad_sequences(target_data, padding='post')
print("Sequence:", target_data[0])

Sequence: [ 1  6 11  9 10  5  3  7  2  0  0  0]


In [27]:
print(source_data.shape)

(20, 15)


In [33]:
print(target_data)
target_data.shape[0]

[[ 1  6 11  9 10  5  3  7  2  0  0  0]
 [ 1 20 16  3  4  2  0  0  0  0  0  0]
 [ 1  6 11  8 12 21 17  3  7  2  0  0]
 [ 1 22 18  5  3 13 14  4  2  0  0  0]
 [ 1  6 11  8 23  3  7  2  0  0  0  0]
 [ 1 19 24 15 13 14  4  2  0  0  0  0]
 [ 1  6  8 25  9 10  5  3  7  2  0  0]
 [ 1 26 16 19 27  5  3  4  2  0  0  0]
 [ 1  6 12 28 29  9 10  5  3  7  2  0]
 [ 1 30 31 32 33  3 34 17 35  4  2  0]
 [ 1 36  3 15 13 14  4  2  0  0  0  0]
 [ 1 37  3  4  2  0  0  0  0  0  0  0]
 [ 1 38 39  9 10  5  3  4  2  0  0  0]
 [ 1  6 11  8 40  3  7  2  0  0  0  0]
 [ 1 41 42 43 44  9 10  5  3 45  4  2]
 [ 1 46 47  8 48 49  3  4  2  0  0  0]
 [ 1 50 51 52  3 53  4  2  0  0  0  0]
 [ 1  6  8 54 15 12 55  3  7  2  0  0]
 [ 1 56 57 58  5  3  4  2  0  0  0  0]
 [ 1 59 18  5  3  4  2  0  0  0  0  0]]


20

In [29]:
#creating the class labels by shifting one keyword
target_labels = np.zeros(target_data.shape)
#print(target_labels)
target_labels[:,0:target_data.shape[1] -1] = target_data[:,1:]
print(target_labels)


[[ 6. 11.  9. 10.  5.  3.  7.  2.  0.  0.  0.  0.]
 [20. 16.  3.  4.  2.  0.  0.  0.  0.  0.  0.  0.]
 [ 6. 11.  8. 12. 21. 17.  3.  7.  2.  0.  0.  0.]
 [22. 18.  5.  3. 13. 14.  4.  2.  0.  0.  0.  0.]
 [ 6. 11.  8. 23.  3.  7.  2.  0.  0.  0.  0.  0.]
 [19. 24. 15. 13. 14.  4.  2.  0.  0.  0.  0.  0.]
 [ 6.  8. 25.  9. 10.  5.  3.  7.  2.  0.  0.  0.]
 [26. 16. 19. 27.  5.  3.  4.  2.  0.  0.  0.  0.]
 [ 6. 12. 28. 29.  9. 10.  5.  3.  7.  2.  0.  0.]
 [30. 31. 32. 33.  3. 34. 17. 35.  4.  2.  0.  0.]
 [36.  3. 15. 13. 14.  4.  2.  0.  0.  0.  0.  0.]
 [37.  3.  4.  2.  0.  0.  0.  0.  0.  0.  0.  0.]
 [38. 39.  9. 10.  5.  3.  4.  2.  0.  0.  0.  0.]
 [ 6. 11.  8. 40.  3.  7.  2.  0.  0.  0.  0.  0.]
 [41. 42. 43. 44.  9. 10.  5.  3. 45.  4.  2.  0.]
 [46. 47.  8. 48. 49.  3.  4.  2.  0.  0.  0.  0.]
 [50. 51. 52.  3. 53.  4.  2.  0.  0.  0.  0.  0.]
 [ 6.  8. 54. 15. 12. 55.  3.  7.  2.  0.  0.  0.]
 [56. 57. 58.  5.  3.  4.  2.  0.  0.  0.  0.  0.]
 [59. 18.  5.  3.  4.  2.  0.  

In [34]:
print("Target sequence", target_data[0])
print("Target label", target_labels[0])

Target sequence [ 1  6 11  9 10  5  3  7  2  0  0  0]
Target label [ 6. 11.  9. 10.  5.  3.  7.  2.  0.  0.  0.  0.]


In [35]:
#calculate vocabulary size
source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1
source_vocab_size,target_vocab_size

(65, 60)

In [36]:
# define decode function

def decode(encoded, tokenizer):
  for number in encoded:
    if number !=0:
      print ("%d -> %s" % (number, tokenizer.index_word[number]))


In [37]:
decode(source_data[0], source_tokenizer)

1 -> <start>
12 -> do
8 -> you
19 -> want
9 -> a
10 -> cup
6 -> of
3 -> coffee
7 -> ?
2 -> <end>


In [38]:
dataset = tf.data.Dataset.from_tensor_slices([8, 3, 0, 8, 2, 1])
for elem in dataset:
  print(elem.numpy())

8
3
0
8
2
1


In [40]:
# define the dataset with batch size
batch_size = 5
dataset = tf.data.Dataset.from_tensor_slices((source_data, target_data, target_labels)).batch(batch_size)
for elem in dataset:
  print(elem)

(<tf.Tensor: id=41, shape=(5, 15), dtype=int32, numpy=
array([[ 1, 12,  8, 19,  9, 10,  6,  3,  7,  2,  0,  0,  0,  0,  0],
       [ 1,  5, 20, 26,  3, 27,  4,  2,  0,  0,  0,  0,  0,  0,  0],
       [ 1, 28,  5, 29,  8,  9,  3,  7,  2,  0,  0,  0,  0,  0,  0],
       [ 1, 13, 30, 14, 21,  3,  4,  2,  0,  0,  0,  0,  0,  0,  0],
       [ 1, 15,  8, 16, 14, 11, 17,  3,  7,  2,  0,  0,  0,  0,  0]])>, <tf.Tensor: id=42, shape=(5, 12), dtype=int32, numpy=
array([[ 1,  6, 11,  9, 10,  5,  3,  7,  2,  0,  0,  0],
       [ 1, 20, 16,  3,  4,  2,  0,  0,  0,  0,  0,  0],
       [ 1,  6, 11,  8, 12, 21, 17,  3,  7,  2,  0,  0],
       [ 1, 22, 18,  5,  3, 13, 14,  4,  2,  0,  0,  0],
       [ 1,  6, 11,  8, 23,  3,  7,  2,  0,  0,  0,  0]])>, <tf.Tensor: id=43, shape=(5, 12), dtype=float64, numpy=
array([[ 6., 11.,  9., 10.,  5.,  3.,  7.,  2.,  0.,  0.,  0.,  0.],
       [20., 16.,  3.,  4.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 6., 11.,  8., 12., 21., 17.,  3.,  7.,  2.,  0.,  0.

In [41]:
#configuration data
embedding_size = 32
rnn_size = 64


In [63]:
# define encoder class

class Encoder(tf.keras.Model):
  def __init__(self):
    super(Encoder, self).__init__()
    
    self.embedding = tf.keras.layers.Embedding(source_vocab_size,
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)
    
  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)        
    return output, state
  
  def init_state(self, batch_size):
    return tf.zeros((batch_size, rnn_size))



In [55]:
ex_sentence = tf.expand_dims(source_data[0], axis=0)
ex_sentence
ex_translation = tf.expand_dims(target_data[0], axis=0)
ex_translation
ex_labels = tf.expand_dims(target_labels[0], axis=0)
print(ex_sentence.shape, ex_translation.shape, ex_labels.shape)

(1, 15) (1, 12) (1, 12)


In [62]:
encoder = Encoder()
hidden_state = encoder.init_state(batch_size=1)
print(hidden_state.shape)

output, hidden_state = encoder(ex_sentence, hidden_state)
# print(output)

(1, 64)


In [65]:
# define the decoder class
class Decoder(tf.keras.Model):
  def __init__(self):
    super(Decoder, self).__init__()
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, 
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)

    self.dense = tf.keras.layers.Dense(target_vocab_size)


  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)
    logits = self.dense(output)
    return logits, state


In [67]:
#debugging of the constructed items
#debugging of items
input_sent = source_data[0]
input_sent = tf.expand_dims(input_sent, axis=0)
print(input_sent)

hidden_state = encoder.init_state(batch_size=1)
output, hidden_state = encoder(input_sent, hidden_state)
#print(output, hidden_state)

tf.Tensor([[ 1 12  8 19  9 10  6  3  7  2  0  0  0  0  0]], shape=(1, 15), dtype=int32)


In [70]:
decoder_input = tf.expand_dims([target_tokenizer.word_index['<start>']], axis=0)
print(decoder_input)

tf.Tensor([[1]], shape=(1, 1), dtype=int32)


In [75]:
decoder_state=hidden_state
print(hidden_state)
decoder=Decoder()
decoder_output, decoder_state = decoder(decoder_input, decoder_state)
# print(decoder_output, decoder_state)

decoder_input = tf.argmax(decoder_output, -1)
word_idx = decoder_input.numpy()[0][0]
print(decoder_input)
print(target_tokenizer.index_word[word_idx])

tf.Tensor(
[[-0.00914953  0.00451236 -0.00965445  0.00386949  0.00790744 -0.01609711
   0.02677082 -0.01060981  0.00955295  0.01818538  0.01540457  0.01056781
   0.00658734  0.02966356 -0.01944757 -0.00051034  0.02631384  0.0165773
  -0.01260577  0.02279333  0.0023177   0.01755016 -0.01033975  0.02802606
   0.01639891 -0.00434892  0.01363721 -0.02372819  0.01855787 -0.0046172
   0.01304835 -0.00377202  0.01229745 -0.00640432  0.02098239  0.00826915
  -0.00606747  0.01547022  0.00860626 -0.00881946  0.01471723 -0.01078211
  -0.00271665  0.00400213 -0.00543378  0.02301455 -0.00844434 -0.00918204
   0.01584662  0.02288154  0.0114758  -0.00512738  0.0038305   0.01713556
   0.02562306  0.00689743 -0.00112571  0.01507663  0.0413456   0.03438532
  -0.00144762 -0.01492808 -0.00288165  0.00951597]], shape=(1, 64), dtype=float32)
tf.Tensor([[19]], shape=(1, 1), dtype=int64)
dos


In [76]:
#do the translation

def translate(idx=None):
  
    if idx == None: 
      idx = np.random.choice(len(sentences))
    
    input_sent = source_data[idx]
    input_sent = tf.expand_dims(input_sent, axis=0)
    print(input_sent)
    
    hidden_state = encoder.init_state(batch_size=1)
    output, hidden_state = encoder(input_sent, hidden_state)
    
    decoder_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)
    out_words = []
    print(decoder_input)
    
    decoder_state = hidden_state
    
    decoder=Decoder()

    while True:
      
        decoder_output, decoder_state = decoder(decoder_input, decoder_state)
        decoder_input = tf.argmax(decoder_output, -1)
        word_idx = decoder_input.numpy()[0][0]
        # if we've predicted 0 (which is reserved, usually this will only happen
        # before the decoder is trained, just stop translating and return
        # what we have)
        if word_idx == 0: 
          out_words.append('<end>')
        else:
          out_words.append(target_tokenizer.index_word[word_idx])

        if out_words[-1] == '<end>' or len(out_words) >= 20:
          break
          
    translation = ' '.join(out_words)    
    return sentences[idx][0], sentences[idx][1], translation


In [77]:
input_sent, target_sent, translation = translate()
print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))

tf.Tensor([[ 1 32 33  9 10  6  3  7  2  0  0  0  0  0  0]], shape=(1, 15), dtype=int32)
tf.Tensor([[1]], shape=(1, 1), dtype=int32)
Input: How about a cup of coffee?
Target: ¿Qué tal una taza de café?
Translation: ¿ <start> <start> habra y dame sin taza bebido y tomar o cinco lo dos fuerte hice quieres hice habra

