Machine Learning Model for English-French Translation

In [1]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
import collections
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU ,Embedding, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, LSTM, Dropout
from keras.optimizers import Adam 
from keras.losses import sparse_categorical_crossentropy
from keras.callbacks import ModelCheckpoint


In [2]:
# Show the logical devices TensorFlow exposes to this kernel.
tf.config.experimental.list_logical_devices()

[LogicalDevice(name='/device:CPU:0', device_type='CPU')]

In [3]:
# Raw string literal: the Windows path contains backslashes (\D, \E, \e) that
# are not valid escape sequences. A normal string still yields this exact
# value today but triggers a warning on current Python versions.
data = pd.read_csv(r"D:\DML\English-French\eng_-french.csv")
print(data.head())

  English words/sentences French words/sentences
0                     Hi.                 Salut!
1                    Run!                Cours !
2                    Run!               Courez !
3                    Who?                  Qui ?
4                    Wow!             Ça alors !


In [4]:
# Count the null entries in each column of the loaded frame.
data.isnull().sum()

English words/sentences    0
French words/sentences     0
dtype: int64

In [5]:
# Pull each language column out of the frame as its own Series.
english, french = (
    data[columnName]
    for columnName in ("English words/sentences", "French words/sentences")
)


Word Counts in English and French

In [6]:
# Frequency of every whitespace-delimited token, one counter per language.
EnglishWordsCounter = collections.Counter(
    token for line in english for token in line.split()
)
FrenchWordsCounter = collections.Counter(
    token for line in french for token in line.split()
)

In [7]:
def printWordStats(language, counter, top=20):
    """Print total, unique and most frequent word counts for one corpus.

    Args:
        language: Label inserted into every printed line (e.g. 'English').
        counter: ``collections.Counter`` mapping each word to its frequency.
        top: How many of the most frequent words to list.
    """
    # The counter already holds one count per token occurrence, so its values
    # sum to the corpus size without splitting every sentence a second time.
    totalWords = sum(counter.values())
    # Comprehension instead of list(zip(*...))[0], which raises IndexError
    # when the counter is empty.
    mostCommon = [word for word, _ in counter.most_common(top)]

    print('{} Words: {}'.format(language, totalWords))
    print('{} Unique Words: {}'.format(language, len(counter)))
    print('{} Most Commons Words in {} data: '.format(top, language))
    print('"' + '" "'.join(mostCommon) + '"')
    print()


printWordStats('English', EnglishWordsCounter)
printWordStats('French', FrenchWordsCounter)

English Words: 1082098
English Unique Words: 27393
20 Most Commons Words in English data: 
"I" "to" "you" "the" "a" "is" "Tom" "of" "in" "have" "was" "that" "He" "I'm" "for" "don't" "do" "You" "your" "be"

French Words: 1177832
French Unique Words: 44918
20 Most Commons Words in French data: 
"de" "Je" "?" "pas" "que" "à" "ne" "la" "le" "Il" "Tom" "est" "vous" "un" "a" "ce" "en" "une" "me" "je"



Tokenization 

In [8]:
def TokenFunction(t):
    """Fit a fresh Keras ``Tokenizer`` on ``t`` and encode ``t`` with it.

    Tokenization divides the original text into tokens (words) and maps each
    one to an integer id.

    Args:
        t: Collection of texts passed to ``fit_on_texts`` and then to
            ``texts_to_sequences``.

    Returns:
        Tuple ``(sequences, tokenizer)``: the encoded texts and the fitted
        tokenizer that produced them.
    """
    fittedTokenizer = Tokenizer()
    fittedTokenizer.fit_on_texts(t)
    encodedTexts = fittedTokenizer.texts_to_sequences(t)
    return encodedTexts, fittedTokenizer

In [9]:
## pad_sequences to ensure that all sequences have the same length
DEFAULT_SEQUENCE_LENGTH = 55


def padSequences(t, length=None):
    """Post-pad every sequence in ``t`` to one common length.

    Args:
        t: Sequences of integer ids (or an already padded array) accepted by
            Keras ``pad_sequences``.
        length: Target length of every returned sequence. When ``None`` the
            default of 55 is used, which is the width this helper previously
            applied to every call regardless of the argument.

    Returns:
        The array produced by ``pad_sequences`` with ``maxlen=length`` and
        padding appended at the end of each sequence.
    """
    if length is None:
        # Keep the historical width for no-argument callers so the arrays
        # they receive stay the same shape as before.
        length = DEFAULT_SEQUENCE_LENGTH
    # Previously maxlen was hard-coded, so an explicit `length` was ignored.
    return pad_sequences(t, maxlen=length, padding='post')

In [10]:
def preprocessFunction(x,y):
    """Tokenize two parallel corpora and pad both to one shared length.

    Args:
        x: Collection of sentences for the first language.
        y: Collection of sentences for the second language.

    Returns:
        Tuple ``(preprocessX, preprocessY, tokenX, tokenY)``: the padded id
        array for ``x``, the padded id array for ``y`` with an extra trailing
        axis of size 1, and the two fitted tokenizers.
    """
    preprocessX, tokenX = TokenFunction(x)
    preprocessY, tokenY = TokenFunction(y)

    # Longest encoded sentence across both corpora.
    sharedLength = max(
        len(sequence)
        for sequences in (preprocessX, preprocessY)
        for sequence in sequences
    )

    # Route both sides through padSequences with the same explicit length.
    # `y` previously went through raw `pad_sequences`, which pads at the front
    # by default and so disagreed with the post-padding applied to `x`.
    preprocessX = padSequences(preprocessX, sharedLength)
    preprocessY = padSequences(preprocessY, sharedLength)
    # Append a trailing axis of size 1 to the second array.
    preprocessY = preprocessY.reshape(*preprocessY.shape, 1)

    return preprocessX, preprocessY, tokenX, tokenY

In [11]:
# Tokenize and pad both corpora; keep the fitted tokenizers for decoding.
(
    preprocessEnglishSentences,
    preprocessFrenchSentences,
    englishTokenizer,
    frenchTokenizer,
) = preprocessFunction(english, french)

In [12]:
# Width of each padded array (its second axis).
maxEnglishSequenceLength, maxFrenchSequenceLength = (
    padded.shape[1]
    for padded in (preprocessEnglishSentences, preprocessFrenchSentences)
)

# Number of distinct words each tokenizer learned.
englishVocabSize, frenchVocabSize = (
    len(fitted.word_index)
    for fitted in (englishTokenizer, frenchTokenizer)
)

for caption, value in (
    ("Maximum English Sentence Length: ", maxEnglishSequenceLength),
    ("Maximum French Sentence Length: ", maxFrenchSequenceLength),
    ("English Vocabulary Size: ", englishVocabSize),
    ("French Vocabulary Size: ", frenchVocabSize),
):
    print(caption, value)

Maximum English Sentence Length:  55
Maximum French Sentence Length:  55
English Vocabulary Size:  14531
French Vocabulary Size:  30660


Converting Final Prediction Into Text Form

In [13]:
def textConverter(index, tokenizer):
    """Turn a 2-D array of per-position scores into a space-separated string.

    Args:
        index: 2-D array-like; the arg-max along axis 1 selects one id per row.
        tokenizer: Object exposing a ``word_index`` mapping of word -> id.

    Returns:
        The words for the selected ids joined by single spaces; id 0 is
        rendered as the placeholder 'CONVERTER'.
    """
    # Invert word -> id into id -> word, then register the placeholder for 0.
    vocabulary = dict(
        (position, token) for token, position in tokenizer.word_index.items()
    )
    vocabulary[0] = 'CONVERTER'

    bestIds = np.argmax(index, 1)
    return ' '.join(vocabulary[bestId] for bestId in bestIds)

 Bidirectional RNN Model With Embeddings

In [22]:
def BidirectionalModel(inputShape, outputSequenceLength, englishVocabSize, frenchVocabSize):
    """Build and compile an embedding + bidirectional GRU sequence model.

    Args:
        inputShape: Shape tuple of the padded input array; ``inputShape[1]``
            is used as the input sequence length.
        outputSequenceLength: Not used by the model; kept so existing callers
            keep working.
        englishVocabSize: Number of units in the final softmax layer.
        frenchVocabSize: Number of distinct ids accepted by the embedding
            layer.

    Returns:
        A compiled ``Sequential`` model trained with sparse categorical
        cross-entropy and the Adam optimizer.
    """
    learningRate = 0.003

    #Layers
    model = Sequential()
    model.add(Embedding(frenchVocabSize,256,input_length=inputShape[1], input_shape=inputShape[1:]))
    model.add(Bidirectional(GRU(256, return_sequences=True)))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(englishVocabSize, activation='softmax')))

    # The metric name must be lowercase: 'accuracy' lets Keras pick the
    # accuracy variant that matches the sparse categorical loss, whereas
    # 'Accuracy' resolves to the exact-match Accuracy metric, which compares
    # the raw softmax outputs against the integer labels.
    model.compile(loss = sparse_categorical_crossentropy, optimizer= Adam(learningRate), metrics=['accuracy'])
    return model

In [15]:
# Display the shape of the padded English array.
preprocessEnglishSentences.shape

(175621, 55)

In [23]:
# Prepare the model input: pad the French array to its own width, then
# reshape it to two dimensions.
inputReshape = padSequences(
    preprocessFrenchSentences,
    preprocessFrenchSentences.shape[1],
).reshape((-1, preprocessFrenchSentences.shape[-2]))

# Build the (not yet trained) model and print its architecture.
# Both vocabulary sizes are passed with +1, presumably to leave room for the
# padding id 0 -- confirm against the tokenizer's id range.
model = BidirectionalModel(
    inputReshape.shape,
    preprocessEnglishSentences.shape[1],
    len(englishTokenizer.word_index) + 1,
    len(frenchTokenizer.word_index) + 1,
)
model.summary()



Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 55, 256)           7849216   
                                                                 
 bidirectional_5 (Bidirectio  (None, 55, 512)          789504    
 nal)                                                            
                                                                 
 time_distributed_10 (TimeDi  (None, 55, 1024)         525312    
 stributed)                                                      
                                                                 
 dropout_5 (Dropout)         (None, 55, 1024)          0         
                                                                 
 time_distributed_11 (TimeDi  (None, 55, 14532)        14895300  
 stributed)                                                      
                                                      

In [1]:
# batch_size must be a positive integer; the previous value of 0 cannot form
# a batch. 64 is a moderate choice for CPU training -- tune as needed.
with tf.device('/CPU:0'):
    model.fit(inputReshape, preprocessEnglishSentences, batch_size=64, epochs=3, validation_split=0.2)

NameError: name 'tf' is not defined

In [None]:
def Converter(text, tokenizer):
    """Turn a 2-D array of per-position scores into a space-separated string.

    Args:
        text: 2-D array-like; the arg-max along axis 1 selects one id per row.
        tokenizer: Object exposing a ``word_index`` mapping of word -> id.

    Returns:
        The words for the selected ids joined by single spaces; id 0 is
        rendered as the placeholder '<CONVERTER>'.
    """
    # Invert word -> id into id -> word, then register the placeholder for 0.
    lookup = dict(
        (position, token) for token, position in tokenizer.word_index.items()
    )
    lookup[0] = '<CONVERTER>'

    chosenIds = np.argmax(text, 1)
    return ' '.join(lookup[chosenId] for chosenId in chosenIds)