In [1]:
# Locate the dataset directory.
# On Google Colab the data lives on Google Drive, which has to be mounted first.
# Anywhere else the Colab helper cannot be imported, so fall back to the current
# working directory instead of failing on the import.
try:
    from google.colab import drive
except ImportError:
    # Not running on Colab: expect the dataset files next to the notebook
    root = '.'
else:
    drive.mount('/content/drive')
    root = '/content/drive/My Drive/English Dataset'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


### 1. Packages

In [1]:
import pandas as pd
import numpy as np
import string
import re
import pickle
import os
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from string import digits

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### 2. Data Preparation

In [2]:
# Read dataset
# Loads the pickled parallel corpus from the dataset directory; later cells read its
# `Eng` and `Mar` columns.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
lines = pd.read_pickle(os.path.join(root, 'mar-eng_cleaned.parallel'))

NameError: name 'root' is not defined

In [None]:
# View the shape of dataset (quick sanity check that the load above succeeded)
lines.shape

In [None]:
# Add 'start' and 'end' tokens to target sentences.
# The decoder is fed '<START>' as its first token and stops when it emits '<END>'.
# Sentences that already carry both markers are left untouched, so re-running this
# cell does not wrap them a second time.
lines['Mar'] = lines['Mar'].apply(
    lambda x: x if x.startswith('<START> ') and x.endswith(' <END>') else '<START> ' + x + ' <END>'
)

In [None]:
# View a few samples of the dataset (5 randomly chosen rows)
lines.sample(5)

In [None]:
# Get vocabulary and embeddings
# Each pickle is used below as a dict with the keys 'dictionary' (word -> index),
# 'reverse_dictionary' (index -> word) and 'embeddings' (the embedding matrix).
# NOTE(review): pickle.load can execute arbitrary code -- only load files from a trusted source.
with open(os.path.join(root, 'embeddings.en'), 'rb') as f:
    english_summary = pickle.load(f)
    
with open(os.path.join(root, 'embeddings.ma'), 'rb') as f:
    marathi_summary = pickle.load(f)

In [None]:
# Add start and end tokens to dictionary.
# Each new token gets the next free index, the matching reverse entry and an
# all-zero row appended to the embedding matrix.
# Tokens that are already registered are skipped, so re-running this cell neither
# reassigns their indices nor appends further embedding rows.
for word in ['<START>', '<END>']:
    if word in marathi_summary['dictionary']:
        continue

    next_index = len(marathi_summary['dictionary'])
    marathi_summary['dictionary'][word] = next_index
    marathi_summary['reverse_dictionary'][next_index] = word

    embedding_dim = marathi_summary['embeddings'].shape[1]
    marathi_summary['embeddings'] = np.vstack(
        (marathi_summary['embeddings'], np.zeros((1, embedding_dim)))
    )

In [None]:
# Source vocabulary: every word known to the English dictionary
all_eng_words = set(english_summary['dictionary'])

# Target vocabulary: every word known to the Marathi dictionary
all_mar_words = set(marathi_summary['dictionary'])

In [None]:
# Longest source sentence, counted in single-space-separated tokens
# (0 when there are no sentences at all)
max_length_src = max((len(sentence.split(' ')) for sentence in lines.Eng), default=0)

max_length_src

In [None]:
# Longest target sentence, counted in single-space-separated tokens
# (0 when there are no sentences at all)
max_length_tar = max((len(sentence.split(' ')) for sentence in lines.Mar), default=0)

max_length_tar

In [None]:
# Vocabulary sizes. They size the two embedding layers, the decoder's softmax
# output and the one-hot target array built in encode_target.
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_mar_words)
num_encoder_tokens, num_decoder_tokens

In [None]:
# Word -> index lookups used when encoding sentences into index arrays
source_dictionary = english_summary['dictionary']
target_dictionary = marathi_summary['dictionary']

In [None]:
# Index -> word lookups; the target one turns predicted indices back into words when decoding
source_reverse_dictionary = english_summary['reverse_dictionary']
target_reverse_dictionary = marathi_summary['reverse_dictionary']

In [None]:
# Shuffle the sentence pairs. The fixed random_state makes the resulting order
# the same on every fresh run of the notebook.
lines = shuffle(lines, random_state=42)
lines.head(10)

### 3. Batch Generator

In [None]:
def encode_input(X):
    """
        Convert a batch of source sentences into a zero-padded array of word indices.

        X = batch of inputs (an iterable of whitespace-separated sentences)

        Returns a float32 array of shape (len(X), max_length_src). Row i holds the
        dictionary indices of the words of sentence i; unused positions stay 0.
        Words missing from source_dictionary are encoded with the index of 'UNK'.
        Sentences with more than max_length_src words are cut off at that length
        instead of raising an IndexError.
    """
    # Get the batch_size
    batch_size = len(X)

    # Create a numpy array of zeros to hold input
    encoder_input_data = np.zeros((batch_size, max_length_src), dtype='float32')

    for i, input_text in enumerate(X):
        # Keep only as many words as the array has columns
        words = input_text.split()[:max_length_src]
        for t, word in enumerate(words):
            # Unknown words share the 'UNK' entry of the dictionary
            if word not in source_dictionary:
                word = 'UNK'
            encoder_input_data[i, t] = source_dictionary[word]

    return encoder_input_data

def encode_target(y):
    """
        Convert a batch of target sentences into decoder inputs and one-hot decoder targets.

        y = batch of outputs (an iterable of whitespace-separated sentences)

        Returns a tuple (decoder_input_data, decoder_target_data):
          * decoder_input_data: float32 array of shape (len(y), max_length_tar) holding
            the word indices of every word except the last one of each sentence.
          * decoder_target_data: float32 array of shape
            (len(y), max_length_tar, num_decoder_tokens); position t is the one-hot
            encoding of word t+1, i.e. the targets are the inputs shifted by one step.

        Words missing from target_dictionary are encoded with the index of 'UNK' when
        the dictionary has such an entry (as encode_input does for the source side);
        otherwise a KeyError for the offending word is raised.
    """
    # Get the batch_size
    batch_size = len(y)

    # Create numpy arrays of zeros to hold encoded targets
    decoder_input_data = np.zeros((batch_size, max_length_tar), dtype='float32')
    decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens), dtype='float32')

    # Fallback index for out-of-vocabulary words (None when there is no 'UNK' entry)
    unk_index = target_dictionary.get('UNK')

    for i, target_text in enumerate(y):
        # Split once per sentence instead of once per word
        words = target_text.split()
        last = len(words) - 1

        # A single word is neither a decoder input nor a target
        if last == 0:
            continue

        for t, word in enumerate(words):
            index = target_dictionary.get(word, unk_index)
            if index is None:
                raise KeyError(word)

            # Every word but the last is fed to the decoder
            if t < last:
                decoder_input_data[i, t] = index

            # Every word but the first is a prediction target, one step earlier
            if t > 0:
                decoder_target_data[i, t - 1, index] = 1.0

    return decoder_input_data, decoder_target_data

In [None]:
def generate_batch(X, y, batch_size=128):
    """
        Endlessly yield encoded training batches, cycling over the dataset.

        X = Source dataset
        y = Target dataset
        batch_size = Size of each batch

        Each yielded item is ([encoder_input_data, decoder_input_data], decoder_target_data).
        The last batch of a pass may be smaller than batch_size.

        Raises ValueError (on the first next() call, as this is a generator) when
        batch_size is not positive, when X and y differ in length, or when the
        dataset is empty -- an empty dataset or a negative batch_size would
        otherwise loop forever without yielding anything.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')
    if len(X) != len(y):
        raise ValueError('X and y must contain the same number of samples')
    if len(X) == 0:
        raise ValueError('cannot generate batches from an empty dataset')

    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = encode_input(X[j:j+batch_size])
            decoder_input_data, decoder_target_data = encode_target(y[j:j+batch_size])

            yield([encoder_input_data, decoder_input_data], decoder_target_data)

### 4. Encoder - Decoder Model Architecture

In [None]:
# Train-test split: hold out 10% of the sentence pairs for validation.
# The fixed random_state makes the split the same on every fresh run of the notebook.
X, y = lines.Eng, lines.Mar
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train.shape, X_test.shape

In [None]:
# Shared width of the model: used as the output size of both embedding layers and
# as the number of units of both LSTMs.
latent_dim = 128

In [None]:
# Training configuration: sample counts (used to derive the steps per epoch),
# batch size fed to the generators and number of passes over the training set.
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 50

#### 4.1 Encoder

In [None]:
# Inputs
# A sequence of word indices; the length is left unspecified
encoder_inputs = Input(shape=(None, ), name='Encoder_Inputs')

# Embedding Lookup
# Initialised from the loaded English embeddings.
# NOTE(review): mask_zero=True makes Keras treat index 0 as padding -- confirm that
# no real word in english_summary['dictionary'] is mapped to 0.
encoder_embedding_layer = Embedding(num_encoder_tokens, latent_dim, mask_zero=True, 
                                    weights=[english_summary['embeddings']], 
                                    name='English_Embedding_Layer')
encoder_embeddings = encoder_embedding_layer(encoder_inputs)

# LSTM
# return_state=True additionally returns the final hidden and cell state
encoder_lstm = LSTM(latent_dim, return_state=True, name='Encoder_LSTM')
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embeddings)

# Keeping only the states and discarding encoder outputs
# The states become the initial state of the decoder LSTM
encoder_states = [state_h, state_c]

#### 4.2 Decoder

In [0]:
# Inputs
# A sequence of target word indices; the length is left unspecified
decoder_inputs = Input(shape=(None, ), name='Decoder_Inputs')

# Embedding
# Initialised from the loaded Marathi embeddings.
# NOTE(review): mask_zero=True makes Keras treat index 0 as padding -- confirm that
# no real word in marathi_summary['dictionary'] is mapped to 0.
decoder_embedding_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero=True, 
                                    weights=[marathi_summary['embeddings']], 
                                    name='Marathi_Embedding_Layer')
decoder_embeddings = decoder_embedding_layer(decoder_inputs)

# LSTM
# Starts from the encoder's final states and returns an output for every time step.
# The returned states are not needed for training; the inference decoder reuses this layer.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='Decoder_LSTM')
decoder_outputs, _, _ = decoder_lstm(decoder_embeddings, initial_state=encoder_states)

# Dense output layer
# Softmax over the target vocabulary for every time step
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='Decoder_Dense')
decoder_outputs = decoder_dense(decoder_outputs)

In [0]:
# Define a model with these layers
# Training model: maps [source indices, target indices] to per-step word probabilities
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [24]:
# Take a look at the model (layers, output shapes and parameter counts)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Encoder_Inputs (InputLayer)     (None, None)         0                                            
__________________________________________________________________________________________________
Decoder_Inputs (InputLayer)     (None, None)         0                                            
__________________________________________________________________________________________________
English_Embedding_Layer (Embedd (None, None, 128)    1117568     Encoder_Inputs[0][0]             
__________________________________________________________________________________________________
Marathi_Embedding_Layer (Embedd (None, None, 128)    1625344     Decoder_Inputs[0][0]             
__________________________________________________________________________________________________
Encoder_LS

In [0]:
# Compile the model
# categorical_crossentropy matches the one-hot targets produced by encode_target;
# accuracy is tracked as 'acc' and monitored by the checkpoint callback as 'val_acc'.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [0]:
# Start training from previously saved weights instead of a fresh initialisation.
# NOTE(review): the file name says 'en_hi' while everything else in this notebook is
# 'en_ma' -- confirm that loading these weights here is intentional and that the
# layer shapes match.
model.load_weights(os.path.join(root, 'nmt_weights_en_hi_ntl_e.h5'))

In [0]:
# Create checkpoints to save model from time to time
# Only the best model so far is kept: the file is overwritten whenever the
# validation accuracy ('val_acc') reaches a new maximum.
filepath = os.path.join(root, 'best_model_en_ma_tl_e.hdf5')
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [0]:
# Train on batches produced by generate_batch and validate on the held-out split.
# The step counts use floor division, so an epoch covers only full batches.
model.fit_generator(generator=generate_batch(X_train, y_train, batch_size), steps_per_epoch=train_samples//batch_size, 
                    epochs=epochs, validation_data=generate_batch(X_test, y_test, batch_size), 
                    validation_steps=val_samples//batch_size, callbacks=callbacks_list)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.22455, saving model to /content/drive/My Drive/English Dataset/best_model_en_ma_ntl_e.hdf5


  '. They will not be included '


Epoch 2/50

Epoch 00002: val_acc improved from 0.22455 to 0.28387, saving model to /content/drive/My Drive/English Dataset/best_model_en_ma_ntl_e.hdf5
Epoch 3/50

Epoch 00003: val_acc improved from 0.28387 to 0.32084, saving model to /content/drive/My Drive/English Dataset/best_model_en_ma_ntl_e.hdf5
Epoch 4/50

Epoch 00004: val_acc improved from 0.32084 to 0.35588, saving model to /content/drive/My Drive/English Dataset/best_model_en_ma_ntl_e.hdf5
Epoch 5/50

Epoch 00005: val_acc improved from 0.35588 to 0.38164, saving model to /content/drive/My Drive/English Dataset/best_model_en_ma_ntl_e.hdf5
Epoch 6/50

Epoch 00006: val_acc improved from 0.38164 to 0.40951, saving model to /content/drive/My Drive/English Dataset/best_model_en_ma_ntl_e.hdf5
Epoch 7/50

Epoch 00007: val_acc improved from 0.40951 to 0.43289, saving model to /content/drive/My Drive/English Dataset/best_model_en_ma_ntl_e.hdf5
Epoch 8/50

Epoch 00008: val_acc improved from 0.43289 to 0.45126, saving model to /content/dr

<keras.callbacks.History at 0x7fe011426b70>

#### 4.3 Save Model

In [0]:
# Persist the trained weights (weights only, not the architecture) to the dataset directory
model.save_weights(os.path.join(root, 'nmt_weights_en_ma_tl_e.h5'))

#### 4.4 Load model

In [0]:
# Restore the saved weights into the model built above, e.g. to skip training in a new session
model.load_weights(os.path.join(root, 'nmt_weights_en_ma_tl_e.h5'))

### 5. Inference Setup

In [0]:
# Encoder-decoder model that uses trained weights from the original model to make predictions

#### 5.1 Inference Encoder

In [0]:
# Encoder model to create a thought vector from the input
# Built from the training graph's tensors, so it uses the same encoder layers and weights;
# its output is the pair [state_h, state_c].
inference_encoder = Model(encoder_inputs, encoder_states)

#### 5.2 Inference Decoder

In [0]:
# For each time step, the decoder states from previous timestep would act as inputs
# These two inputs carry the hidden state (h) and the cell state (c) respectively
decoder_state_input_h = Input(shape=(latent_dim, ), name='Inference_Decoder_Output')
decoder_state_input_c = Input(shape=(latent_dim, ), name='Inference_Decoder_Memory')
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Embedding
# Reuses the decoder embedding layer of the training model
decoder_embeddings_inference = decoder_embedding_layer(decoder_inputs)

# LSTM
# Same layer as in training, but started from the externally supplied states;
# the new states are kept so they can be fed back in on the next step.
decoder_outputs_inference, state_h_inference, state_c_inference = decoder_lstm(decoder_embeddings_inference, 
                                                                               initial_state=decoder_states_inputs)
decoder_states_inference = [state_h_inference, state_c_inference]

# Dense
# Reuses the softmax layer of the training model
decoder_outputs_inference = decoder_dense(decoder_outputs_inference)

In [0]:
# Decoder model
# Inputs:  [target word indices, previous h, previous c]
# Outputs: [word probabilities, new h, new c]
inference_decoder = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs_inference] + decoder_states_inference
)

In [31]:
# Inspect the layers of the inference encoder
inference_encoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Encoder_Inputs (InputLayer)  (None, None)              0         
_________________________________________________________________
English_Embedding_Layer (Emb (None, None, 128)         1117568   
_________________________________________________________________
Encoder_LSTM (LSTM)          [(None, 128), (None, 128) 131584    
Total params: 1,249,152
Trainable params: 1,249,152
Non-trainable params: 0
_________________________________________________________________


In [32]:
# Inspect the layers of the inference decoder
inference_decoder.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder_Inputs (InputLayer)     (None, None)         0                                            
__________________________________________________________________________________________________
Marathi_Embedding_Layer (Embedd (None, None, 128)    1625344     Decoder_Inputs[0][0]             
__________________________________________________________________________________________________
Inference_Decoder_Output (Input (None, 128)          0                                            
__________________________________________________________________________________________________
Inference_Decoder_Memory (Input (None, 128)          0                                            
__________________________________________________________________________________________________
Decoder_LS

#### 5.3 Decode sample sequences

In [0]:
def decode_sequence(input_sequence, max_decoder_length=None):
    """
        Greedily translate one encoded source sentence, one word at a time.

        input_sequence = encoded source sentence, as produced by encode_input for a
                         batch of one sentence
        max_decoder_length = maximum number of words to generate; defaults to
                             max_length_tar

        Returns the generated words joined into one string in which every word is
        preceded by a single space. The string ends with '<END>' unless decoding was
        stopped by the length limit.

        The length limit counts generated words. It previously compared the number of
        characters of the sentence against 50, which cut longer translations short.
    """
    if max_decoder_length is None:
        max_decoder_length = max_length_tar

    # Get thought vector by encoding the input sequence
    states_value = inference_encoder.predict(input_sequence)

    # Generate target sequence initialized with <START> token
    target_sequence = np.zeros((1, 1))
    target_sequence[0, 0] = target_dictionary['<START>']

    # Words generated so far
    decoded_words = []

    while True:
        # Get next prediction
        output_tokens, h, c = inference_decoder.predict([target_sequence] + states_value)

        # Get the token with max probability
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = target_reverse_dictionary[sampled_token_index]
        decoded_words.append(sampled_word)

        # Test for exit condition
        if sampled_word == '<END>' or len(decoded_words) >= max_decoder_length:
            break

        # Update the target sequence with current prediction
        target_sequence = np.zeros((1, 1))
        target_sequence[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    # Same format as before: every word is prefixed with one space
    return ''.join(' ' + word for word in decoded_words)

### 6. Evaluation on Train Dataset

In [43]:
# Decode a single, empty source sentence (its encoding is a row of zeros) and show
# the result without its final token, which is '<END>' when decoding ended normally.
input_sequence = encode_input([''])
decoded_sentence = decode_sequence(input_sequence)
' '.join(decoded_sentence.split()[:-1])

'माझं नाव आहे'