In [19]:
import tensorflow as tf
import collections
import os
import pandas as pd
import numpy as np

In [20]:
def load_data(path, encoding='utf-8'):
    """Read a text file and return its contents as a list of lines.

    Args:
        path: Location of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that accented characters decode the same way on every platform,
            instead of depending on the locale's default encoding.

    Returns:
        list[str]: The file contents split on '\n'. A file that ends with a
        newline therefore yields a trailing empty string as its last element.
    """
    with open(path, 'r', encoding=encoding) as f:
        data = f.read()

    return data.split('\n')

In [21]:
# Read the English and French corpora, one list of sentence strings per file.
eng_sentences, french_sentences = map(load_data, ('small_vocab_en.txt', 'small_vocab_fr.txt'))

In [52]:
# Show the sentence at index 10 from each corpus, one per line.
print(eng_sentences[10], french_sentences[10], sep='\n')

the lime is her least liked fruit , but the banana is my least liked .
la chaux est son moins aimé des fruits , mais la banane est mon moins aimé.


In [23]:
import collections

# Word-frequency tables for each corpus, built from whitespace-split tokens.
eng_words = collections.Counter(word for sentence in eng_sentences for word in sentence.split())
french_words = collections.Counter(word for sentence in french_sentences for word in sentence.split())

# The per-word counts sum to the total number of tokens, so the corpora do not
# have to be split and flattened a second time just to measure their length.
print('Total English Words : {}'.format(sum(eng_words.values())))
print('Total French Words : {}'.format(sum(french_words.values())))

Total English Words : 1823250
Total French Words : 1961295


In [32]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional,RepeatVector,GRU,TimeDistributed
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

from tensorflow.keras import regularizers

In [33]:
def tokenize(x):
    """Fit a word-level Keras Tokenizer on `x` and encode `x` with it.

    Args:
        x: The texts to fit on and convert.

    Returns:
        tuple: (sequences, tokenizer) where `sequences` is the result of
        `texts_to_sequences(x)` and `tokenizer` is the fitted Tokenizer.
    """
    # num_words=None puts no cap on the vocabulary; char_level=False tokenizes by word.
    fitted = Tokenizer(num_words=None, char_level=False)
    fitted.fit_on_texts(x)
    return fitted.texts_to_sequences(x), fitted

def pad(x, length=None):
    """Pad (and truncate) sequences at their end via Keras `pad_sequences`.

    Args:
        x: The sequences to pad.
        length: Forwarded as `maxlen`. With the default None, the choice of
            length is left to `pad_sequences`.

    Returns:
        Whatever `pad_sequences` returns for the post-padded, post-truncated input.
    """
    return pad_sequences(x, maxlen=length, padding='post', truncating='post')


In [34]:
def preprocess(x,y):
    """Tokenize and pad the source texts `x` and the target texts `y`.

    Each side gets its own tokenizer and is padded independently with the
    default length of `pad`.

    Args:
        x: Source-side texts.
        y: Target-side texts.

    Returns:
        tuple: (padded_x, padded_y, x_tokenizer, y_tokenizer), where
        `padded_y` has one extra trailing axis of size 1.
    """
    encoded_x, x_tokenizer = tokenize(x)
    encoded_y, y_tokenizer = tokenize(y)

    padded_x = pad(encoded_x)
    padded_y = pad(encoded_y)

    # Append a trailing axis of size 1 to the targets
    # (presumably the label shape expected by the sparse loss used for training -- TODO confirm).
    padded_y = padded_y.reshape(padded_y.shape + (1,))

    return padded_x, padded_y, x_tokenizer, y_tokenizer

In [35]:
# Tokenize and pad both corpora; keep the fitted tokenizers for later id <-> word lookups.
(
    preproc_english_sentences,
    preproc_french_sentences,
    english_tokenizer,
    french_tokenizer,
) = preprocess(eng_sentences, french_sentences)

In [36]:
# Show the shapes of the padded English inputs and the padded French targets.
print(preproc_english_sentences.shape, preproc_french_sentences.shape, sep='\n')

(137861, 15)
(137861, 21, 1)


In [37]:
def modelB(input_shape,output_shape,english_vocab_size,french_vocab_size):
    """Build and compile a Sequential bidirectional-GRU encoder/decoder model.

    Layer stack: Embedding -> Bidirectional GRU (final state only) -> Dense
    (relu) -> RepeatVector -> Bidirectional GRU (full sequence) ->
    TimeDistributed Dense (softmax).

    Args:
        input_shape: Shape tuple of the input array; index 1 is used as the
            Embedding `input_length`, and `input_shape[1:]` is passed to the
            first Bidirectional layer.
        output_shape: Integer used both as the Embedding output dimension and
            as the RepeatVector repeat count.
        english_vocab_size: Used as the Embedding input dimension and as the
            unit count of both GRU layers.
        french_vocab_size: Width of both Dense layers.

    Returns:
        The compiled Sequential model (sparse categorical cross-entropy loss,
        Adam optimizer, 'acc' metric).
    """
    per_sample_shape = input_shape[1:]

    # NOTE(review): the GRU width is tied to the English vocabulary size and the
    # embedding width to `output_shape` -- confirm these couplings are intended.
    layers = [
        Embedding(english_vocab_size, output_shape, input_length=per_sample_shape[0]),
        Bidirectional(GRU(english_vocab_size, return_sequences=False), input_shape=per_sample_shape),
        Dense(french_vocab_size, activation='relu'),
        RepeatVector(output_shape),
        Bidirectional(GRU(english_vocab_size, return_sequences=True)),
        TimeDistributed(Dense(french_vocab_size, activation='softmax')),
    ]
    model = Sequential(layers)

    # 10e-3 evaluates to 0.01. NOTE(review): confirm 1e-3 was not the intended learning rate.
    optimizer = Adam(10e-3)
    model.compile(loss=sparse_categorical_crossentropy, optimizer=optimizer, metrics=['acc'])

    return model

In [57]:
def predictions(x,y,x_tok,y_tok):
    """Build the `modelB` network for the given data, print its summary, and train it.

    Args:
        x: Input array passed to `fit`; its full shape is handed to `modelB`.
        y: Target array passed to `fit`; `y.shape[1]` is handed to `modelB`.
        x_tok: Tokenizer for the inputs; vocabulary size is len(word_index) + 1.
        y_tok: Tokenizer for the targets; vocabulary size is len(word_index) + 1.

    Returns:
        The model after 20 epochs of training with batch size 1024 and a
        0.2 validation split.
    """
    input_shape = x.shape
    target_len = y.shape[1]
    # +1 on each vocabulary size (presumably to leave room for id 0, which is
    # not a key of word_index -- TODO confirm).
    source_vocab = len(x_tok.word_index) + 1
    target_vocab = len(y_tok.word_index) + 1

    model = modelB(input_shape, target_len, source_vocab, target_vocab)
    model.summary()
    model.fit(x, y, batch_size=1024, epochs=20, validation_split=0.2)

    return model


In [58]:
# Build and train the model on the preprocessed corpora.
model = predictions(preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer)


# Reverse lookup from French token id to word. Id 0 is mapped to '<PAD>'
# because it is not a key of word_index and fills the padded positions.
y_id_to_word = {value: key for key, value in french_tokenizer.word_index.items()}
y_id_to_word[0] = '<PAD>'

# Encode one English sentence the same way as the training inputs:
# word ids, then post-padding to the training sequence length.
sentence = 'india is rainy during june  and it is sometimes warm in november '
sentence = [english_tokenizer.word_index[word] for word in sentence.split()]
sentence = pad_sequences([sentence], maxlen=preproc_english_sentences.shape[-1], padding='post')
sentences = np.array([sentence[0]])
# Keep the output under its own name: rebinding `predictions` would shadow the
# training helper and make this cell fail when it is run a second time.
predicted_probs = model.predict(sentences, batch_size=len(sentences))
# Pick the highest-scoring French word id at every output position.
print(' '.join([y_id_to_word[np.argmax(x)] for x in predicted_probs[0]]))


Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 15, 21)            4200      
_________________________________________________________________
bidirectional_26 (Bidirectio (None, 400)               267600    
_________________________________________________________________
dense_26 (Dense)             (None, 346)               138746    
_________________________________________________________________
repeat_vector_13 (RepeatVect (None, 21, 346)           0         
_________________________________________________________________
bidirectional_27 (Bidirectio (None, 21, 400)           657600    
_________________________________________________________________
time_distributed_13 (TimeDis (None, 21, 346)           138746    
Total params: 1,206,892
Trainable params: 1,206,892
Non-trainable params: 0
___________________________________________