In [2]:
!pip install sacrebleu 

Collecting sacrebleu
  Downloading https://files.pythonhosted.org/packages/45/31/1a135b964c169984b27fb2f7a50280fa7f8e6d9d404d8a9e596180487fd1/sacrebleu-1.4.3-py3-none-any.whl
Collecting portalocker
  Downloading https://files.pythonhosted.org/packages/91/db/7bc703c0760df726839e0699b7f78a4d8217fdc9c7fcb1b51b39c5a22a4e/portalocker-1.5.2-py2.py3-none-any.whl
Collecting typing
  Downloading https://files.pythonhosted.org/packages/fe/2e/b480ee1b75e6d17d2993738670e75c1feeb9ff7f64452153cf018051cc92/typing-3.7.4.1-py3-none-any.whl
Installing collected packages: portalocker, typing, sacrebleu
Successfully installed portalocker-1.5.2 sacrebleu-1.4.3 typing-3.7.4.1


In [3]:
import numpy as np
import re
import sacrebleu
import tensorflow as tf
import time
import unicodedata



In [4]:
 tf.enable_eager_execution() #should be called at startup

AttributeError: module 'tensorflow' has no attribute 'enable_eager_execution'

In [6]:
sentences = [
  ("Do you want a cup of coffee?", "¿Quieres una taza de café?"),
  ("I've had coffee already.", "Ya tomé café."),
  ("Can I get you a coffee?", "¿Quieres que te traiga un café?"),
  ("Please give me some coffee.", "Dame algo de café por favor."),
  ("Would you like me to make coffee?", "¿Quieres que prepare café?"),
  ("Two coffees, please.", "Dos cafés, por favor."),
  ("How about a cup of coffee?", "¿Qué tal una taza de café?"),
  ("I drank two cups of coffee.", "Me tomé dos tazas de café."),
  ("Would you like to have a cup of coffee?", "¿Te gustaría tomar una taza de café?"),
  ("There'll be coffee and cake at five.", "A las cinco habrá café y un pastel."),
  ("Another coffee, please.", "Otro café, por favor."),
  ("I made coffee.", "Hice café."),
  ("I would like to have a cup of coffee.", "Quiero beber una taza de café."),
  ("Do you want me to make coffee?", "¿Quieres que haga café?"),
  ("It is hard to wake up without a strong cup of coffee.", "Es difícil despertarse sin una taza de café fuerte."),
  ("All I drank was coffee.", "Todo lo que bebí fue café."),
  ("I've drunk way too much coffee today.", "He bebido demasiado café hoy."),
  ("Which do you prefer, tea or coffee?", "¿Qué prefieres, té o café?"),
  ("There are many kinds of coffee.", "Hay muchas variedades de café."),
  ("I will make some coffee.",	"Prepararé algo de café.")
]

In [8]:
sentences[0]

('Do you want a cup of coffee?', '¿Quieres una taza de café?')

In [9]:
def preprocess(s):
  # for details, see https://www.tensorflow.org/alpha/tutorials/sequences/nmt_with_attention
  s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
  s = re.sub(r"([?.!,¿])", r" \1 ", s)
  s = re.sub(r'[" "]+', " ", s)
  s = re.sub(r"[^a-zA-Z?.!,¿]+", " ", s)
  s = s.strip()
  s = '<start> ' + s + ' <end>'
  return s

In [10]:
print("Original:", sentences[0])
sentences = [(preprocess(source), preprocess(target)) for (source, target) in sentences]
print("Preprocessed:", sentences[0])

Original: ('Do you want a cup of coffee?', '¿Quieres una taza de café?')
Preprocessed: ('<start> Do you want a cup of coffee ? <end>', '<start> ¿ Quieres una taza de cafe ? <end>')


In [11]:
source_sentences, target_sentences = list(zip(*sentences))
source_sentences, target_sentences


(('<start> Do you want a cup of coffee ? <end>',
  '<start> I ve had coffee already . <end>',
  '<start> Can I get you a coffee ? <end>',
  '<start> Please give me some coffee . <end>',
  '<start> Would you like me to make coffee ? <end>',
  '<start> Two coffees , please . <end>',
  '<start> How about a cup of coffee ? <end>',
  '<start> I drank two cups of coffee . <end>',
  '<start> Would you like to have a cup of coffee ? <end>',
  '<start> There ll be coffee and cake at five . <end>',
  '<start> Another coffee , please . <end>',
  '<start> I made coffee . <end>',
  '<start> I would like to have a cup of coffee . <end>',
  '<start> Do you want me to make coffee ? <end>',
  '<start> It is hard to wake up without a strong cup of coffee . <end>',
  '<start> All I drank was coffee . <end>',
  '<start> I ve drunk way too much coffee today . <end>',
  '<start> Which do you prefer , tea or coffee ? <end>',
  '<start> There are many kinds of coffee . <end>',
  '<start> I will make some coff

In [12]:
source_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters=" ")
source_tokenizer.fit_on_texts(source_sentences)
source_data=source_tokenizer.texts_to_sequences(source_sentences)
print("Sequence:", source_data[0])
source_data=tf.keras.preprocessing.sequence.pad_sequences(source_data, padding="post")
print("Sequence:", source_data[0])


Sequence: [1, 12, 8, 19, 9, 10, 6, 3, 7, 2]
Sequence: [ 1 12  8 19  9 10  6  3  7  2  0  0  0  0  0]


In [13]:
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
target_tokenizer.fit_on_texts(target_sentences)
target_data = target_tokenizer.texts_to_sequences(target_sentences)
target_data = tf.keras.preprocessing.sequence.pad_sequences(target_data, padding='post')

In [14]:
# Create labels for the decoder by shifting the target sequence
# one to the right.
#target_data.shape
target_labels = np.zeros(target_data.shape)
target_labels[:,0:target_data.shape[1] -1] = target_data[:,1:]

In [15]:
print("Target sequence", target_data[0])
print("Target label", target_labels[0])

Target sequence [ 1  6 11  9 10  5  3  7  2  0  0  0]
Target label [ 6. 11.  9. 10.  5.  3.  7.  2.  0.  0.  0.  0.]


In [17]:
source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

In [18]:
source_vocab_size

65

In [19]:
target_vocab_size

60

In [20]:
def decode(encoded, tokenizer):
  for number in encoded:
    if number !=0:
      print ("%d -> %s" % (number, tokenizer.index_word[number]))
      
decode(source_data[0], source_tokenizer)

1 -> <start>
12 -> do
8 -> you
19 -> want
9 -> a
10 -> cup
6 -> of
3 -> coffee
7 -> ?
2 -> <end>


In [21]:
batch_size = 5
dataset = tf.data.Dataset.from_tensor_slices((source_data, target_data, target_labels)).batch(batch_size)
#dataset

In [22]:
embedding_size = 32
rnn_size = 64

In [28]:
class Encoder(tf.keras.Model):
  def __init__(self):
    super(Encoder, self).__init__()
    
    self.embedding = tf.keras.layers.Embedding(source_vocab_size,
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)
    
  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)        
    return output, state
  
  def init_state(self, batch_size):
    return tf.zeros((batch_size, rnn_size))

In [29]:
ex_sentence = tf.expand_dims(source_data[0], axis=0)
ex_sentence
ex_translation = tf.expand_dims(target_data[0], axis=0)
ex_translation
ex_labels = tf.expand_dims(target_labels[0], axis=0)
print(ex_sentence.shape, ex_translation.shape, ex_labels.shape)

(1, 15) (1, 12) (1, 12)


In [30]:
encoder = Encoder()
hidden_state = encoder.init_state(batch_size=1)
print(hidden_state.shape)

output, hidden_state = encoder(ex_sentence, hidden_state)
print(output.shape)


(1, 64)
(1, 15, 64)


In [31]:
class Decoder(tf.keras.Model):
  def __init__(self):
    super(Decoder, self).__init__()
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, 
                                               embedding_size)
    self.gru = tf.keras.layers.GRU(rnn_size, 
                                   return_sequences=True, 
                                   return_state=True)

    self.dense = tf.keras.layers.Dense(target_vocab_size)


  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state=hidden)
    logits = self.dense(output)
    return logits, state

In [37]:
input_sent = source_data[0]
input_sent = tf.expand_dims(input_sent, axis=0)
print(input_sent)

hidden_state = encoder.init_state(batch_size=1)
output, hidden_state = encoder(input_sent, hidden_state)
#print(output, hidden_state)

decoder_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)
print(decoder_input)

tf.Tensor([[ 1 12  8 19  9 10  6  3  7  2  0  0  0  0  0]], shape=(1, 15), dtype=int32)


In [33]:
def translate(idx=None):
  
    if idx == None: 
      idx = np.random.choice(len(sentences))
    
    input_sent = source_data[idx]
    input_sent = tf.expand_dims(input_sent, axis=0)
    print(input_sent)
    
    hidden_state = encoder.init_state(batch_size=1)
    output, hidden_state = encoder(input_sent, hidden_state)
    
    decoder_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)
    out_words = []
    print(decoder_input)
    
    decoder_state = hidden_state

    while True:
      
        decoder_output, decoder_state = decoder(decoder_input, decoder_state)
        decoder_input = tf.argmax(decoder_output, -1)
        word_idx = decoder_input.numpy()[0][0]
        # if we've predicted 0 (which is reserved, usually this will only happen
        # before the decoder is trained, just stop translating and return
        # what we have)
        if word_idx == 0: 
          out_words.append('<end>')
        else:
          out_words.append(target_tokenizer.index_word[word_idx])

        if out_words[-1] == '<end>' or len(out_words) >= 20:
          break
          
    translation = ' '.join(out_words)    
    return sentences[idx][0], sentences[idx][1], translation

In [49]:
input_sent, target_sent, translation = translate()
print("Input: %s\nTarget: %s\nTranslation: %s\n" % (input_sent, target_sent, translation))

AttributeError: 'Tensor' object has no attribute 'numpy'