# __Machine Translation from Spanish to English__
* Architecture Used: GRUs
* Programming Languages, Libraries, and Frameworks: Python 3.x, NumPy, Matplotlib, sklearn, TensorFlow 2.x, Keras[Tensorflow Backend], unicodedata, os, io, time

* __Dataset__:-
manythings.org is currently the best website that has the largest collection of Tab-delimited Bilingual Sentence Pairs. It has Tab-delimited Bilingual Sentence Pairs for over 30 different languages. The dataset contains the language-translation pairs in the format:-
“English + TAB + The Other Language + TAB + Attribution”
This website provides datasets only for translating between “English and the Other Language”. manythings.org is not the only source; there are many other websites from which you can download a dataset of your choice.

## Libraries

In [1]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata 
#This module provides access to the Unicode Character Database (UCD) which
# defines character properties for all Unicode characters
import re
import numpy as np
import os
import io
import time

import warnings
warnings.filterwarnings('ignore')

## Preprocessing
1. Add a 'start' and 'end' token.
2. Removing the special characters.
3. Creating a dictionary mapping from word → id and id →word
4. Representing each sentence in its vectorized form.
5. Padding each Sentence to a maximum length.

In [2]:
# Download (and cache) the Spanish-English archive; extract=True asks Keras
# to unpack it after downloading.
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip', origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

# The corpus is expected at spa-eng/spa.txt next to the downloaded archive.
# os.path.join is used instead of gluing the pieces together with a
# hard-coded "/" separator.
path_to_file = os.path.join(os.path.dirname(path_to_zip), "spa-eng", "spa.txt")

In [3]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
    """Strip combining accent marks from ``s``.

    The text is decomposed with Unicode NFD normalisation, which splits an
    accented letter into its base letter followed by combining marks; every
    character whose category is 'Mn' (non-spacing mark) is then dropped.
    Characters that do not decompose are returned unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)


def preprocess_sentence(w):
    """Normalise one raw sentence and wrap it in <start>/<end> markers.

    Steps, in order:
      1. lower-case, trim and strip accents (via ``unicode_to_ascii``);
      2. put a space on both sides of each of ? . ! , ¿ so punctuation
         becomes its own token, e.g. "he is a boy." => "he is a boy ."
         (see https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation);
      3. collapse runs of spaces and double-quote characters into one space;
      4. replace every run of characters other than a-z, A-Z, ?, ., !, ",",
         and ¿ with a single space;
      5. trim again and add the start and end tokens, so that the model
         knows when to start and stop predicting.

    Returns the cleaned sentence as a single string.
    """
    cleaned = unicode_to_ascii(w.lower().strip())

    # Isolate punctuation, then squeeze the whitespace this introduces.
    cleaned = re.sub(r"([?.!,¿])", r" \1 ", cleaned)
    cleaned = re.sub(r'[" "]+', " ", cleaned)

    # Anything that is neither a letter nor kept punctuation becomes a space.
    cleaned = re.sub(r"[^a-zA-Z?.!,¿]+", " ", cleaned)

    return '<start> ' + cleaned.strip() + ' <end>'

In [4]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENGLISH, SPANISH]
def create_dataset(path, num_examples):
    """Read a tab-separated parallel corpus and return cleaned sentence columns.

    Args:
        path: Path to a UTF-8 text file with one example per line; the
            fields of a line are separated by tab characters.
        num_examples: Number of leading lines to use; ``None`` uses them all.

    Returns:
        A ``zip`` iterator that yields one tuple per column of the file,
        each holding that column's sentences after ``preprocess_sentence``.
    """
    # Use a context manager so the file handle is closed as soon as the
    # contents are read (the handle was previously left open).
    with io.open(path, encoding='UTF-8') as corpus:
        lines = corpus.read().strip().split('\n')

    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]
                  for l in lines[:num_examples]]

    # Transpose [[col0, col1], ...] into (col0 sentences, col1 sentences).
    return zip(*word_pairs)

def tokenize(lang):
    """Fit a Keras tokenizer on ``lang`` and return its padded id sequences.

    Args:
        lang: Iterable of preprocessed sentences (strings).

    Returns:
        A pair ``(tensor, tokenizer)``: the sentences converted to integer id
        sequences and padded at the end ('post') to a common length, and the
        fitted ``Tokenizer``.
    """
    # filters='' disables the tokenizer's character filtering, so tokens such
    # as "<start>" and the punctuation marks are kept as they are.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(lang)

    sequences = tokenizer.texts_to_sequences(lang)
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences,
                                                           padding='post')
    return padded, tokenizer


def load_dataset(path, num_examples=None):
    """Load the corpus at ``path`` and tokenize both of its columns.

    ``create_dataset`` must yield exactly two columns: the first is used as
    the target language and the second as the input language.

    Returns:
        ``(input_tensor, target_tensor, inp_lang_tokenizer,
        targ_lang_tokenizer)``.
    """
    # First column -> target sentences, second column -> input sentences.
    target_sentences, input_sentences = create_dataset(path, num_examples)

    input_tensor, inp_lang_tokenizer = tokenize(input_sentences)
    target_tensor, targ_lang_tokenizer = tokenize(target_sentences)

    return (input_tensor, target_tensor,
            inp_lang_tokenizer, targ_lang_tokenizer)

In [5]:
# Only the first 50000 sentence pairs of the corpus are used.
num_examples = 50000

# load_dataset returns the padded id tensors followed by the fitted
# tokenizers, so inp_lang / tar_lang below are Tokenizer objects.
input_tensor, target_tensor, inp_lang, tar_lang = load_dataset(path_to_file, num_examples) 
# Padded sequence length (second axis) of the target and input tensors.
max_length_tar, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]
print(max_length_tar, max_length_inp)

12 16


In [6]:
# Creating training and validation sets using an 80-20 split.
# random_state is fixed so the split is the same on every run.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2, random_state=42)

# Show the number of examples in each of the four arrays
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

40000 40000 10000 10000


In [7]:
#This function is to just display how the data is encoded
def convert(lang, tensor):
    """Print the "id ----> word" mapping for every non-zero id in ``tensor``.

    Args:
        lang: Object exposing an ``index_word`` mapping from id to word.
        tensor: Iterable of integer ids; entries equal to 0 are skipped.
    """
    for token_id in tensor:
        if token_id == 0:
            continue
        print("%d ----> %s" % (token_id, lang.index_word[token_id]))

In [8]:
# Show how one training pair (index 1) is encoded: every non-padding id next
# to the word it stands for, first for the input and then for the target.
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[1])
print ()
print ("Target Language; index to word mapping")
convert(tar_lang, target_tensor_train[1])

Input Language; index to word mapping
1 ----> <start>
7 ----> no
488 ----> seas
75 ----> tan
346 ----> duro
150 ----> conmigo
3 ----> .
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
28 ----> don
12 ----> t
37 ----> be
88 ----> so
229 ----> hard
43 ----> on
16 ----> me
3 ----> .
2 ----> <end>


__Now we will create a TensorFlow dataset object so that we can easily access it while training and then divide it into batches.__

In [9]:
# Number of training examples (displayed as the cell's output).
len(input_tensor_train)

40000

In [10]:
# Shuffle buffer as large as the training set, so the whole set is shuffled.
BUFFER_SIZE = len(input_tensor_train)

BATCH_SIZE = 64
# Number of full batches in one pass over the training data.
steps_per_epoch = len(input_tensor_train) // BATCH_SIZE

# Size of the embedding layer
embedding_dim = 256
# No. of hidden units in the GRU
units = 1024

# Size of the vocabulary at the encoder (i.e., Spanish).
# +1 leaves room for id 0, which pad_sequences uses for padding.
vocab_inp_size = len(inp_lang.word_index)+1
# Size of the vocabulary at the decoder (i.e., English), again +1 for id 0.
vocab_tar_size = len(tar_lang.word_index)+1

# Creating the Tensorflow Dataset object
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
# drop_remainder=True discards the last partial batch so every batch has
# exactly BATCH_SIZE rows; training_step and the encoder's zero state are
# both sized with BATCH_SIZE.
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

2023-03-24 22:14:11.958995: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-24 22:14:11.961898: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 8. Tune using inter_op_parallelism_threads for best performance.


In [11]:
# Pull one batch from the dataset to inspect the (input, target) shapes.
example_inp_batch, example_tar_batch = next(iter(dataset))
example_inp_batch.shape, example_tar_batch.shape

(TensorShape([64, 16]), TensorShape([64, 12]))

## __Creating the Model__
The model we will be building is an encoder-decoder model with an extension of attention.
1. __Encoder__:-
This part of the model uses the simplest type of architecture. We will be using an Embedding Layer followed by GRUs. We will be using Custom Keras layers.

In [12]:
# Creating a Custom Layer
class Encoder(tf.keras.Model):
    """Encoder: an Embedding layer followed by a GRU.

    Args:
        vocab_size: Size of the vocabulary.
        embedding_dim: Size of the embedding layer.
        enc_units: Number of hidden units in the GRU.
        batch_size: Batch size; used only to size the zero state returned by
            ``initialize_hidden_state``.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        super().__init__()

        self.enc_units = enc_units
        self.batch_size = batch_size

        # Token ids -> dense vectors.
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        # return_sequences gives the output at every time step and
        # return_state additionally gives the final hidden state.
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, inp, hidden):
        """Encode a batch of token ids.

        Args:
            inp: Input to the model, i.e. the vectorized form of the Spanish
                sentences.
            hidden: Initial hidden state of the GRU.

        Returns:
            ``(output, state)``: the GRU output for every time step and the
            final hidden state.
        """
        embedded = self.embedding(inp)
        sequence_output, final_state = self.gru(embedded, initial_state=hidden)
        return sequence_output, final_state

    def initialize_hidden_state(self):
        """Return an all-zero hidden state of shape (batch_size, enc_units)."""
        return tf.zeros((self.batch_size, self.enc_units))
    

This will create the encoder Layer:

In [13]:
# Encoder over the input (Spanish) vocabulary, sized for BATCH_SIZE rows.
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

The way this model works is that we input a sentence in the Spanish language into the encoder part. The encoder RNN encodes the sentence and passes it to the decoder RNN which outputs its English Translation. For better results, we use an embedding layer at the encoder part to embed the input sentence. 

We can use the simple encoder and decoder model for our task but these types of architectures don’t work well with long sentences. We may be able to get pretty good results with short sentences but as the length of the sentences increases the accuracy of the model decreases. So to overcome this issue we will be using __Attention__.



2.__Bahdanau Attention__ :-
We know that RNNs are used when we want to account for previous information also. Such that an RNN takes into account the previous information to make a decision. But as the length of the sequence increases the impact of the starting words of the sentence starts decreasing due to the effect called Vanishing Gradients.

Attention addresses this problem: the model learns how much attention to pay to each part of the input sentence while making a decision.
These ideas are useful while making a Translator model because the model can easily decide on which part of the sentence to focus on while translating.

![image.png](attachment:b18d8b21-9276-43eb-970d-db63013ea1cf.png)

A: The output of all the GRUs is collected at this point. Because the encoder GRU returns its output at every time step, the output will be of the shape (batch_size,length_of_sequence,hidden_units).

B: The output of the hidden state is collected at this point. Its shape is (batch_size,hidden_units). It is a 2-D vector but we have to convert it into 3-D to work with the ‘A’ part. So we will just expand the dimensions i.e., (batch_size,1,hidden_units).

STEP-1: After this, both A and B are passed into a dense layer with hidden units “h_u” that will convert A into (batch_size,length_of_sequence,h_u) and B into (batch_size,1,h_u). After this, both A and B are added together and we get the shape (batch_size,length_of_sequence,h_u). This vector is then again passed into a dense layer with only one perceptron so we get the final output in the shape (batch_size,length_of_sequence,1).

STEP-2: After this, there is a softmax layer which gives us the attention weights. These attention weights store information about how much attention each word should get.

STEP-3: These attention weights are then multiplied with the original output of the encoder “The output of all the GRUs” and then added along the “length_of_sequence” axis to get a Context Vector that can be used at the decoder to generate the translation.

In [14]:
# Creating a Custom layer
class Attention(tf.keras.layers.Layer):
    """Additive (Bahdanau-style) attention over the encoder outputs.

    Args:
        units: Number of hidden units ("h_u") of the two projection layers.
    """

    def __init__(self,units):
        super().__init__()

        self.units = units
        self.W1 = tf.keras.layers.Dense(units)   # projects the query
        self.W2 = tf.keras.layers.Dense(units)   # projects the values
        self.V = tf.keras.layers.Dense(1)        # one score per position

    def call(self, query, values):
        """Compute the context vector and the attention weights.

        Args:
            query: The hidden state of the GRUs (refer to "B").
            values: The output of the GRUs (refer to "A").

        Returns:
            ``(context_vector, attention_weights)``.
        """
        # Insert a new axis at position 1 so the projected query can be
        # added to the projected values.
        query_expanded = tf.expand_dims(query, 1)

        # Step 1: project both inputs to "h_u" units, add them, apply tanh
        # and reduce the result to a single score with a one-neuron layer.
        projected_query = self.W1(query_expanded)
        projected_values = self.W2(values)
        score = self.V(
            tf.keras.activations.tanh(projected_query + projected_values)
        )

        # Step 2: softmax along axis 1 turns the scores into weights.
        attention_weights = tf.keras.activations.softmax(score, 1)

        # Step 3: weight the values and sum them along axis 1.
        context_vector = tf.reduce_sum(attention_weights * values, 1)

        return context_vector, attention_weights

In [15]:
# Stand-alone Attention layer with h_u = 10, created for demonstration.
# The Decoder class builds its own Attention instance internally.
attention_layer = Attention(10)         #h_u = 10

3.__Decoder__:-
Now we have to create the decoder model that will translate the Encoded Text into English. The Decoder is the same as the Encoder but with a few changes. The Behaviour of the Decoder model will Change depending on whether it is in the “training process” or “testing process”. First, let us focus on the training part. As mentioned in the preprocessing step, each sentence starts with a <start> token. So irrespective of whether we are training or testing, the first input to the decoder model will be a start token. Consider the Diagram Below:-
    ![image.png](attachment:c235e79c-3b17-4048-b2eb-0a330adf3667.png)
This is the architecture of our model. You might be wondering why we are giving the decoder model an input. Remember we are considering the ‘Training Phase’ and during training, we do not use sampling rather we provide the input to the decoder so that it can predict the next word based on the true input and the attention vector. And then the error is calculated based on the word generated by the model.
    
1. A Spanish sentence is given to the encoder.
2. The encoded sentence is passed to the attention Mechanism.
3. The attention mechanism generates a vector that completely encodes the sentence.
4. <start> token is passed to the embedding layer of the decoder.
5. The output from the embedding layer is concatenated with the encoded vector from step 3.
6. The concatenated vector is then passed to the GRU to predict the next word.
7. The predicted word is compared with the true word to calculate the error.
8. If “Estoy Bien” is input and its ground truth is “I am Fine”, then the step-7 should predict the word “I”.If there is any other word then the error will be calculated.
9. After that, “I” is passed to the decoder that passes through the embedding layer and which then after concatenation with the vector from step 3 is passed to the GRU that tries to predict the next word. This is how the training is done. This is also called “Teacher forcing”
10. Remember: We provide the true input to the decoder at the time of training but during testing, we don’t provide any input. We use the method of sampling.

In [16]:
# Custom Layer
class Decoder(tf.keras.Model):
    """Decoder: attention + Embedding + GRU + Dense projection to the vocabulary.

    Args:
        vocab_size: Size of the vocabulary for the resulting sentence
            (i.e., English).
        embedding_dim: Size of the embedding layer.
        dec_units: Number of hidden units in the GRU.
        batch_size: Batch size (stored on the instance).
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_size):
        super(Decoder, self).__init__()
        self.dec_units = dec_units
        self.batch_size = batch_size

        # Embedding layer
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        # GRU
        self.gru = tf.keras.layers.GRU(dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

        # Dense layer producing one logit per vocabulary entry
        self.dense = tf.keras.layers.Dense(vocab_size)

        # The attention layer is owned by the decoder, so the attention is
        # calculated at the decoder.
        self.attention = Attention(self.dec_units)

    def call(self, x, hidden, enc_output):
        """Run one decoding step.

        Args:
            x: Input to the decoder (token ids of the previous word).
            hidden: Hidden state of the GRU from the previous step; its last
                dimension must equal ``dec_units`` because it is used as the
                GRU's initial state.
            enc_output: Output from the encoder layer (not attention layer).

        Returns:
            ``(logits, state, attention_weights)``.
        """
        # Calculate the attention (context) vector
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # Embedding layer
        x = self.embedding(x)

        # Concatenate the context vector with the embedded input
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # Continue from the previous hidden state. Without initial_state the
        # GRU would restart from zeros at every step and `hidden` would only
        # influence the attention scores.
        output, state = self.gru(x, initial_state=hidden)

        # Merge the batch and time axes: (batch, time, dec_units) becomes
        # (batch * time, dec_units).
        output = tf.reshape(output, (-1, output.shape[2]))

        # Project to the vocabulary
        x = self.dense(output)

        return x, state, attention_weights
    

In [17]:
# Decoder over the target (English) vocabulary, using the same number of
# hidden units as the encoder.
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

### Training the Model:
We will be using the Adam optimizer and the SparseCategoricalCrossentropy loss. We provide the target words as integers, but the output from the decoder is not a single integer; instead, it is a vector of size equal to the dictionary size. We choose the index of the word with the highest probability in that vector. When calculating the loss we have to deal with the whole dictionary.

In [18]:
# Optimizer
optimizer = tf.keras.optimizers.Adam()

# Loss object. from_logits=True because the decoder's Dense layer has no
# activation; reduction='none' keeps one loss value per example so that
# loss_function can mask entries before averaging.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    """Masked cross-entropy for one time step of a batch.

    We take bulks (batches) of data from the dataset while training; this
    function uses the loss object to calculate the loss for the whole bulk.

    Args:
        real: True token ids; entries equal to 0 are masked out.
        pred: Predicted logits for the same examples.

    Returns:
        The mean over all entries of the per-example loss, with the loss of
        masked entries set to zero.
    """
    per_example = loss(real, pred)

    # 1 where the target is a real token, 0 where it is id 0.
    is_real_token = tf.math.logical_not(tf.math.equal(real, 0))
    weights = tf.cast(is_real_token, dtype=per_example.dtype)

    return tf.reduce_mean(per_example * weights)
    

Creating Checkpoints and Storing Model Progress:

In [19]:
# Directory where the checkpoints are stored
checkpoint_dir = './training_checkpoints'

# Path prefix passed to checkpoint.save() for the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')

# Checkpoint object that tracks the optimizer, the encoder and the decoder
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                encoder=encoder, 
                                decoder = decoder)

# The Attention layer is an attribute of the decoder, so it does not have to
# be listed here explicitly.

1. Define the number of Epochs and Initialize the hidden state of the Encoder model.
2. Initialize total loss(initially zero).
3. Get the bulk of data from the dataset.
4. Pass this data to the encoder. The encoder will return the encoded sentences and its hidden state.
5. Pass the hidden state and the encoded output to the decoder along with the true output, i.e., the original translation.
6. Pass the hidden state and encoded output to the attention mechanism to generate a new encoding vector.
7. Pass the 'start' token to the decoder. Note: As we are using batches of data to train, we have to pass the 'start' token to every sentence in the batch. This can be done by duplicating 'start' as many times as the size of the batch.
    
8. Step 6 and 7 are done simultaneously in the decoder layer. Then the decoder layer generates predictions and its hidden state.
9. Calculate the error on the predictions. As we calculated loss for the whole batch, we have to account for it by dividing the loss by the length of the sequence. It is just a convention.
10. Now, get all the trainable variables., weights and calculate the gradient(Derivative) of those variables with respect to loss. 

Finally, update the variables by applying gradients.
That’s it. This is how you can train a Neural Machine Translation Model. For convenience, We have divided this into two functions. One for steps 1–3 and the other for 4–10.
    
Function for steps 4–10:-

In [20]:
def training_step(inp, targ, enc_hidden_state):
    """Run one teacher-forced training step on a batch and update the weights.

    Args:
        inp: Batch of input token ids.
        targ: Batch of target token ids; column 0 is expected to hold the
            '<start>' token, so prediction starts at column 1.
        enc_hidden_state: Initial hidden state for the encoder.

    Returns:
        The summed per-step loss divided by the target sequence length.
    """
    summed_loss = 0
    start_id = tar_lang.word_index['<start>']

    with tf.GradientTape() as tape:
        # Step 4: encode the inputs; the encoder's final state becomes the
        # decoder's first hidden state.
        enc_output, enc_hidden_state = encoder(inp, enc_hidden_state)
        dec_hidden_state = enc_hidden_state

        # Step 7: one '<start>' id per row, expanded to 2-D so it can be
        # concatenated with the attention vector inside the decoder.
        dec_input = tf.expand_dims([start_id] * BATCH_SIZE, 1)

        for t in range(1, targ.shape[1]):
            predictions, dec_hidden_state, _ = decoder(
                dec_input, dec_hidden_state, enc_output)

            # targ[:, t] is the t'th word of every sentence in the batch, so
            # the loss for the whole batch is accumulated at once.
            true_words = targ[:, t]
            summed_loss += loss_function(true_words, predictions)

            # Teacher forcing: the true word is the next decoder input.
            dec_input = tf.expand_dims(true_words, 1)

    batch_loss = summed_loss / int(targ.shape[1])

    # Differentiate the summed loss with respect to all trainable variables
    # of both models, then apply the gradients.
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(summed_loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss


__Training the model__

In [None]:
Epochs = 10
for epoch in range(Epochs):
    # to calculate the time taken for each epoch
    start = time.time()
    # The same all-zero encoder state is passed to every batch of the epoch
    # (training_step returns only the batch loss).
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    
    # Taking the batches of data
    for (batch, (inp, tar)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = training_step(inp, tar, enc_hidden)
        total_loss += batch_loss
        
        # Printing stats every 100 batches
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
            
        
    # saving (checkpoint) the model after every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix= checkpoint_prefix)

    # Printing the mean batch loss and the wall-clock time of the epoch
    print('Epoch {}, Loss{:.4f}'.format(epoch+1, 
                                       total_loss/steps_per_epoch))
    print('Time taken for 1 Epoch {} sec\n'.format(time.time()-start))

Epoch 1 Batch 0 Loss 4.4711
Epoch 1 Batch 100 Loss 2.2360


## Evaluating and Testing the Model
 During training, we provided the decoder with the actual output so that it could learn. But now we want to convert our Spanish text into English. We don’t have the English translation; we want the model to provide it. But the decoder model requires the previous word to predict the next word. This is where sampling comes into place.
 ![image.png](attachment:81890ea7-9f54-4b5a-8d9c-8d928f788894.png)
 First, we pass the <start> token to the decoder. It then predicts the next word. This word is no longer used for calculating the error. Instead, it is used as the input to the next GRU (or time step). At the next time step, the model generates another word. We keep track of all the words by appending them to an initially empty string. So when do we stop generating new words?
When the maximum length of the output sequence is achieved or As soon as the model predicts the next word as <end> we will stop feeding the RNN.
    
overview:-

1. Input a sentence in Spanish.
2. Preprocess the Sentence.
3. Split the sentence into words and pad zeros at the end if the length of the sentence is less than the length accepted by the RNN.
4. Convert it into a tensor.
5. Initialize hidden state of encoder, feed the sentence to the encoder, and get the encoded output.
6. Input the 'start' token in the decoder.
7. Start a loop for predicting the translation of the given sentence word by word.
8. At each step, the decoder will output a vector of size equal to the dictionary size. We pick the index of the highest value from the vector (which refers to the most probable word), convert it into its corresponding word using the dictionary, append the word to the output string, and pass the index of that word back to the decoder model to predict the next word. This is SAMPLING!

In [None]:
# Function to translate a sentence
def evaluate(sentence):
    """Translate one sentence by greedy decoding.

    Args:
        sentence: Raw input sentence (Spanish).

    Returns:
        ``(result, sentence, attention_plot)``: the predicted words joined by
        spaces (each word, including a final '<end>' if one was predicted, is
        followed by a space), the preprocessed input sentence, and a
        (max_length_tar, max_length_inp) array holding the attention weights
        of every decoding step, for plotting.

    Raises:
        KeyError: If the sentence contains a word that is not in the input
            vocabulary.
    """
    # One row of attention weights per decoding step.
    attention_plot = np.zeros((max_length_tar, max_length_inp))

    # Steps 2, 3, 4: preprocess, map words to ids, pad, convert to a tensor.
    sentence = preprocess_sentence(sentence)

    # split() without an argument ignores repeated spaces. An empty input
    # preprocesses to "<start>  <end>" (two spaces), and split(' ') would
    # produce an empty token that is never in the vocabulary.
    tokens = sentence.split()
    unknown = [tok for tok in tokens if tok not in inp_lang.word_index]
    if unknown:
        # Same exception type as the plain dictionary lookup would raise,
        # but naming every offending word.
        raise KeyError('Words not in the input vocabulary: '
                       + ', '.join(unknown))

    inputs = [inp_lang.word_index[tok] for tok in tokens]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    # String collecting the translated sentence
    result = ''

    # Step 5: encode with an all-zero initial state for a batch of one.
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden

    # Step 6: the first decoder input is the '<start>' token.
    dec_input = tf.expand_dims([tar_lang.word_index['<start>']], 0)

    # Step 7: predict word by word, for at most max_length_tar steps.
    for t in range(max_length_tar):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)

        # Storing the attention weights to plot later
        attention_weights = tf.reshape(attention_weights, (-1))
        attention_plot[t] = attention_weights.numpy()

        # Step 8: take the most probable word.
        predicted_id = tf.argmax(predictions[0]).numpy()
        predicted_word = tar_lang.index_word[predicted_id]
        result += predicted_word + ' '

        # Stop as soon as the model predicts the end of the sentence.
        if predicted_word == '<end>':
            break

        # The predicted id is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

In [None]:
# Translate a sample sentence; only the translation (first returned value)
# is printed.
sentence = u'Esta es la primera vez que te conocí'
translation, _, _ = evaluate(sentence)
print(translation)

## Visualizing Result

In [None]:
from pylab import *
def plot_attention(attention, sentence, predicted_sentence):
    """Show the attention matrix as a labelled heat map.

    Args:
        attention: 2-D array of attention weights.
        sentence: List of input words, used as x-axis tick labels.
        predicted_sentence: List of predicted words, used as y-axis tick
            labels.
    """
    figure = plt.figure(figsize=(10,10))
    axes = figure.add_subplot(1, 1, 1)
    axes.matshow(attention, cmap='viridis')

    label_font = {'fontsize': 14}

    # An empty label is placed in front of the words on both axes.
    axes.set_xticklabels([''] + sentence, fontdict=label_font, rotation=90)
    axes.set_yticklabels([''] + predicted_sentence, fontdict=label_font)

    # One major tick per unit on both axes.
    for axis in (axes.xaxis, axes.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()

In [None]:
def translate_sentence(sentence):
    """Translate ``sentence``, print the result and plot the attention.

    Args:
        sentence: Raw input sentence, passed to ``evaluate``.
    """
    result, sentence, attention_plot = evaluate(sentence)

    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

    # split() without an argument is used because `result` ends with a
    # space: split(' ') would add an empty trailing token, i.e. one extra
    # attention row and a blank tick label.
    input_tokens = sentence.split()
    output_tokens = result.split()

    # Keep only the rows/columns that correspond to real tokens.
    attention_plot = attention_plot[:len(output_tokens), :len(input_tokens)]
    plot_attention(attention_plot, input_tokens, output_tokens)

In [None]:
# Restore the most recent checkpoint saved in checkpoint_dir.
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
     

In [None]:
# Example translation with its attention plot.
translate_sentence(u'hace mucho frio aqui.')

In [None]:

# Example with an inverted question mark in the input.
translate_sentence(u'¿todavia estan en casa?')

In [None]:
# Example with a capitalised input; preprocessing lower-cases it.
translate_sentence('Esta es mi vida.')

In [None]:
import gradio as gr            

In [None]:
# evaluate() returns (translation, preprocessed sentence, attention matrix),
# but this interface has a single text output, so only the translation is
# passed on. The top-level gr.Textbox component is used in place of the
# deprecated gr.inputs namespace.
interface = gr.Interface(fn=lambda text: evaluate(text)[0],
                         inputs=gr.Textbox(lines=2, placeholder='Text to translate'),
                         outputs='text')

In [None]:
# Start the Gradio app.
interface.launch()

# 