# Deep Learning Translator - Final Submission

In [111]:
import pandas as pd
import collections
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.layers.embeddings import Embedding
from tensorflow.keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from keras.models import Sequential
from keras.models import model_from_json
from keras import backend as K
from nltk.translate.bleu_score import corpus_bleu
import re

In [2]:
# Look at the most repeated words in both dataset
rosetta_bible = pd.read_csv('french_and_english_vocab.csv')


In [3]:
# English Sentences
english_sentences = rosetta_bible.English
# Quechua Sentences
quechua_sentences = rosetta_bible.Quechua


# Tokenize

In [4]:
def tokenize(x):
    x_tk = Tokenizer(char_level = False)
    x_tk.fit_on_texts(x)
    return x_tk.texts_to_sequences(x), x_tk

In [5]:
def pad(x, length=None):
    if length is None:
        length = max([len(sentence) for sentence in x])
    return pad_sequences(x, maxlen = length, padding = 'post')

In [6]:
def preprocess(x, y):
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)
    
    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)
# Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)
    return preprocess_x, preprocess_y, x_tk, y_tk

preproc_english_sentences, preproc_quechua_sentences, english_tokenizer, quechua_tokenizer = preprocess(english_sentences, quechua_sentences)
    

In [7]:
def logits_to_text(logits, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'
    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])
    print('`logits_to_text` function loaded.')

# Final Model

In [8]:
def model_final(input_shape, output_sequence_length, english_vocab_size, quechua_vocab_size):
  
    model = Sequential()
    model.add(Embedding(input_dim=english_vocab_size,output_dim=128,input_length=input_shape[1]))
    model.add(Bidirectional(GRU(256,return_sequences=False)))
    model.add(RepeatVector(output_sequence_length))
    model.add(Bidirectional(GRU(256,return_sequences=True)))
    model.add(TimeDistributed(Dense(quechua_vocab_size,activation='softmax')))
    learning_rate = 0.005
    
    model.compile(loss = sparse_categorical_crossentropy, 
                 optimizer = Adam(learning_rate), 
                 metrics = ['accuracy'])
    
    return model
print('Final Model Loaded')

Final Model Loaded


In [19]:
def final_predictions(x, y, x_tk, y_tk):
    tmp_X = pad(preproc_english_sentences)
    model = model_final(tmp_X.shape,
                        preproc_quechua_sentences.shape[1],
                        len(english_tokenizer.word_index)+1,
                        len(quechua_tokenizer.word_index)+1)
    
    model.fit(tmp_X, preproc_quechua_sentences, batch_size = 400, epochs = 5, validation_split = 0.2)
    
    fitted_model = model
    return fitted_model
   
fitted_model = final_predictions(preproc_english_sentences, preproc_quechua_sentences, english_tokenizer, quechua_tokenizer)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Write fitted model as JSON
After fully training the model we can write an H5 file with all the ideal parameters which will then be loaded

In [10]:
fitted_model.save_weights('model_weights.h5')
json_string = fitted_model.to_json()
f = open("model_architecture.json",'w')
f.write(json_string)
f.close()

# Open fitted Model
We can open the fitted model using the protocol below

In [11]:
f = open("model_architecture.json",'r+')
json_string = f.read()
f.close()
model = model_from_json(json_string)
model.load_weights('model_weights.h5')
model.compile(loss=sparse_categorical_crossentropy, optimizer=Adam(0.005), metrics=['accuracy'])

# Full Prediction Workflow

In [12]:
def tokenize(x):
    x_tk = Tokenizer(char_level = False)
    x_tk.fit_on_texts(x)
    return x_tk.texts_to_sequences(x), x_tk

def pad(x, length=None):
    if length is None:
        length = max([len(sentence) for sentence in x])
    return pad_sequences(x, maxlen = length, padding = 'post')

def preprocess(x, y):
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)
    
    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)
# Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)
    return preprocess_x, preprocess_y, x_tk, y_tk

preproc_english_sentences, preproc_quechua_sentences, english_tokenizer, quechua_tokenizer = preprocess(english_sentences, quechua_sentences)
    
def logits_to_text(logits, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'
    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])
    print('`logits_to_text` function loaded.') 

    
def run(English_Phrase):
    K.clear_session()
    rosetta_bible = pd.read_csv('french_and_english_vocab.csv')
    # English Sentences
    english_sentences = rosetta_bible.English
    # Quechua Sentences
    quechua_sentences = rosetta_bible.Quechua    
    preproc_english_sentences, preproc_quechua_sentences, english_tokenizer, quechua_tokenizer = preprocess(english_sentences, quechua_sentences)
    y_id_to_word = {value: key for key, value in quechua_tokenizer.word_index.items()}
    y_id_to_word[0] = '<PAD>'
    sentence = English_Phrase
#     sentence = 'his favorite fruit is orange'
    sentence = [english_tokenizer.word_index[word] for word in sentence.split()]
    sentence = pad_sequences([sentence], maxlen=preproc_english_sentences.shape[-1], padding='post')
    sentence
    sentences = np.array([sentence[0], preproc_english_sentences[0]])
    
    f = open("model_architecture.json",'r+')
    json_string = f.read()
    f.close()
    model = model_from_json(json_string)
    model.load_weights('model_weights.h5')
    model.compile(loss=sparse_categorical_crossentropy, optimizer=Adam(0.005), metrics=['accuracy'])         

    predictions = model.predict(sentences, len(sentences))
    final_pred = ' '.join([y_id_to_word[np.argmax(x)] for x in predictions[0]])
    final_pred = final_pred.replace("<PAD>","")
    return final_pred
    K.clear_session()

In [13]:
run('his favorite fruit is grapefruit')

'son fruit préféré est le pamplemousse               '

# BLEU Score Validation

In [63]:
index_list = [np.random.randint(len(rosetta_bible), size=10)]
empty_list_english = pd.DataFrame([])
empty_list_french = pd.DataFrame([])
for i in index_list:
    a = empty_list.append(rosetta_bible['English'][i])
    b = empty_list.append(rosetta_bible['Quechua'][i])

In [74]:
# These are the candidate english phrases
a.T

Unnamed: 0,English
59124,"paris is sometimes wonderful during december ,..."
77029,"paris is usually chilly during fall , but it i..."
31930,"the banana is their least favorite fruit , but..."
50956,"california is usually snowy during april , and..."
112496,"the lime is her most loved fruit , but the man..."
61308,"paris is usually freezing during september , b..."
91526,"the apple is my least liked fruit , but the st..."
101651,"new jersey is never rainy during winter , and ..."
78490,he drove the little yellow truck .
73234,"new jersey is usually relaxing during march , ..."


In [75]:
# These are the reference french phrases
b.T

Unnamed: 0,Quechua
59124,"paris est parfois merveilleux en décembre , ma..."
77029,"paris est généralement froid à l'automne , mai..."
31930,"la banane est leur fruit préféré moins , mais ..."
50956,californie est généralement enneigée en avril ...
112496,"la chaux est son fruit le plus aimé , mais la ..."
61308,"paris est le gel habituellement en septembre ,..."
91526,"la pomme est mon moins aimé des fruits , mais ..."
101651,new jersey est jamais pluvieux pendant l' hive...
78490,il a conduit le petit camion jaune .
73234,new jersey est relaxant habituellement en mars...


In [122]:
bleu_scores = list()
for i in range(10):
    mystring = np.array(a)[0][i]
    candidate = run(re.sub('\W+',' ', mystring)).split()
    mystring = np.array(b)[0][i]
    reference = re.sub('\W+',' ', mystring).split()
    score = corpus_bleu([[reference]], [candidate])
    print(candidate)
    print(reference)
    print(score)
    bleu_scores = bleu_scores + [score]

['paris', 'est', 'parfois', 'merveilleux', 'en', 'décembre', 'mais', 'il', 'est', 'relaxant', 'à', "l'", 'automne']
['paris', 'est', 'parfois', 'merveilleux', 'en', 'décembre', 'mais', 'il', 'est', 'relaxant', 'à', 'l', 'automne']
0.842362674378975
['paris', 'est', 'généralement', 'froid', 'à', "l'automne", 'mais', 'il', 'est', 'jamais', 'chaud', 'en', 'janvier']
['paris', 'est', 'généralement', 'froid', 'à', 'l', 'automne', 'mais', 'il', 'est', 'jamais', 'chaud', 'en', 'janvier']
0.7048050905062194
['la', 'banane', 'est', 'leur', 'fruit', 'préféré', 'moins', 'mais', "l'orange", 'est', 'son', 'moins', 'préféré']
['la', 'banane', 'est', 'leur', 'fruit', 'préféré', 'moins', 'mais', 'l', 'orange', 'est', 'son', 'moins', 'préféré']
0.7048050905062194
['californie', 'est', 'généralement', 'enneigée', 'en', 'avril', 'et', 'il', 'fait', 'froid', 'en', 'hiver']
['californie', 'est', 'généralement', 'enneigée', 'en', 'avril', 'et', 'il', 'fait', 'froid', 'en', 'hiver']
1.0
['la', 'chaux', 'est'

In [125]:
np.average(bleu_scores)

0.8918934748860664

# Gradio Interface

In [14]:
import gradio as gr
iface = gr.Interface(fn=run, 
                     inputs="text",
                     outputs="text", 
                     description = "This is a rudimentary English to French translator that uses Deep Learning \
                     (Bidirectional Recurrent Layers)",
                     title="English to French Translator")
iface.launch(share=True)

Running on local URL:  http://127.0.0.1:7869
Running on public URL: https://02259fd3-8385-4a42.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces




## Summary of Results
- Final Model Results (Avg. BLEU Score) ~0.89
- Google Cloud's Translation has an accuracy of almost 100%. Thus our final Encoder-Decoder has proven to be almost as effective.