In [1]:
import tensorflow as tf
physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
config = tf.config.experimental.set_memory_growth(physical_devices[0], True)

from tensorflow.keras import  models, optimizers, layers, activations
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, LSTM, RNN, GRU, Dense, Embedding
import matplotlib.pyplot as plt
import numpy as np
import wandb
from wandb.keras import WandbCallback

In [2]:
#set of hyperparameters to be tuned during the sweep

default_parameters = dict(
    embedding_size = 32,
    batch_size = 32,
    num_enc_layers = 3,
    num_dec_layers = 3,
    hidden_layer_size = 64,
    cell_type = 'LSTM',
    dropout = 0.2,
    recurrent_dropout = 0.2,
    epochs = 10
    )

In [3]:
# wandb login 
run = wandb.init(config=default_parameters, project="CS6910_Assignment3", entity="arneshbose1")
config = wandb.config

wandb: Currently logged in as: arnesh_neil (use `wandb login --relogin` to force relogin)
wandb: wandb version 0.10.27 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


In [4]:
# path to the train, validation and test dataset

train_path = 'dakshina_dataset_v1.0\hi\lexicons\hi.translit.sampled.train.tsv'
val_path = 'dakshina_dataset_v1.0\hi\lexicons\hi.translit.sampled.dev.tsv'
test_path = 'dakshina_dataset_v1.0\hi\lexicons\hi.translit.sampled.test.tsv'

# creating the corpus and vectorizing the data

train_X = []
train_Y = []
input_corpus = set()
output_corpus = set()

with open(train_path, "r", encoding="utf-8") as f:
    lines = f.read().split("\n")
    
for line in lines[:len(lines) - 1]:
    target_text, input_text, _ = line.split("\t")
    #using "tab" as the "start sequence" character for the targets, and "\n" as "end sequence" character.
    target_text = "\t" + target_text + "\n"
    train_X.append(input_text)
    train_Y.append(target_text)
    for char in input_text:
        input_corpus.add(char)
    for char in target_text:
        output_corpus.add(char)

# ' ' is used to fill the empty spaces of shorter sequences
input_corpus.add(" ")
output_corpus.add(" ")
input_corpus = sorted(list(input_corpus))
output_corpus = sorted(list(output_corpus))
num_encoder_tokens = len(input_corpus)
num_decoder_tokens = len(output_corpus)
max_encoder_seq_length = max([len(txt) for txt in train_X])
max_decoder_seq_length = max([len(txt) for txt in train_Y])

In [5]:
val_X = []
val_Y = []
with open(val_path, "r", encoding="utf-8") as f:
    lines = f.read().split("\n")
    
for line in lines[:len(lines) - 1]:
    target_text, input_text, _ = line.split("\t")
    target_text = "\t" + target_text + "\n"
    val_X.append(input_text)
    val_Y.append(target_text)

In [6]:
print("Number of samples:", len(train_X))
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)

Number of samples: 44204
Number of unique input tokens: 27
Number of unique output tokens: 66
Max sequence length for inputs: 20
Max sequence length for outputs: 21


In [7]:
input_char_index = dict([(char, i) for i, char in enumerate(input_corpus)])
output_char_index = dict([(char, i) for i, char in enumerate(output_corpus)])

encoder_input_data = np.zeros((len(train_X), max_encoder_seq_length, num_encoder_tokens), dtype="float32")
decoder_input_data = np.zeros((len(train_X), max_decoder_seq_length, num_decoder_tokens), dtype="float32")
decoder_target_data = np.zeros((len(train_X), max_decoder_seq_length, num_decoder_tokens), dtype="float32")

for i, (x, y) in enumerate(zip(train_X, train_Y)):
    for t, char in enumerate(x):
        encoder_input_data[i, t, input_char_index[char]] = 1.0
        
    encoder_input_data[i, t + 1 :, input_char_index[" "]] = 1.0
    
    for t, char in enumerate(y):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, output_char_index[char]] = 1.0
        if t > 0:
            decoder_target_data[i, t - 1, output_char_index[char]] = 1.0
            
    decoder_input_data[i, t + 1 :, output_char_index[" "]] = 1.0
    decoder_target_data[i, t:, output_char_index[" "]] = 1.0

In [8]:
encoder_input_data_val = np.zeros((len(val_X), max_encoder_seq_length, num_encoder_tokens), dtype="float32")
decoder_input_data_val = np.zeros((len(val_X), max_decoder_seq_length, num_decoder_tokens), dtype="float32")
decoder_target_data_val = np.zeros((len(val_X), max_decoder_seq_length, num_decoder_tokens), dtype="float32")

for i, (x, y) in enumerate(zip(val_X, val_Y)):
    for t, char in enumerate(x):
        encoder_input_data_val[i, t, input_char_index[char]] = 1.0
        
    encoder_input_data_val[i, t + 1 :, input_char_index[" "]] = 1.0
    
    for t, char in enumerate(y):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data_val[i, t, output_char_index[char]] = 1.0
        if t > 0:
            decoder_target_data_val[i, t - 1, output_char_index[char]] = 1.0
            
    decoder_input_data_val[i, t + 1 :, output_char_index[" "]] = 1.0
    decoder_target_data_val[i, t:, output_char_index[" "]] = 1.0

In [12]:
def training_model(num_enc_layers,num_dec_layers, hidden_layer_size, cell_type, dropout, recurrent_dropout,
                   num_encoder_tokens,num_decoder_tokens):
    
    if cell_type == 'LSTM':
        encoder_inputs = Input(shape=(None, num_encoder_tokens))
        x_e = encoder_inputs

        for i in range(num_enc_layers-1):
            x_e = LSTM(hidden_layer_size, return_state=True, return_sequences=True)(x_e)

        encoder_outputs, state_h, state_c = LSTM(hidden_layer_size, return_state=True)(x_e)

        encoder_states = [state_h, state_c]

        decoder_inputs = Input(shape=(None, num_decoder_tokens))
        x_d = decoder_inputs

        x_d = LSTM(hidden_layer_size, return_sequences=True, return_state=True)(x_d,initial_state=encoder_states)
        for i in range(num_dec_layers-1):
            x_d = LSTM(hidden_layer_size, return_sequences=True, return_state=True)(x_d)

        decoder_outputs, _, _ = x_d
        decoder_dense = Dense(num_decoder_tokens, activation="softmax")
        decoder_outputs = decoder_dense(decoder_outputs)

        model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    
    return model

In [13]:
embedding_size = config.embedding_size
batch_size = config.batch_size
num_enc_layers = config.num_enc_layers
num_dec_layers = config.num_dec_layers
hidden_layer_size = config.hidden_layer_size
cell_type = config.cell_type
dropout = config.dropout
recurrent_dropout = config.recurrent_dropout
epochs = config.epochs

In [14]:
model = training_model(num_enc_layers,num_dec_layers, hidden_layer_size, cell_type, dropout, recurrent_dropout,
                   num_encoder_tokens,num_decoder_tokens)
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])
model.fit([encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([encoder_input_data_val, decoder_input_data_val],decoder_target_data_val),
    callbacks=[WandbCallback()]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2732260a7c0>

In [15]:
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, None, 27)]   0                                            
__________________________________________________________________________________________________
lstm_5 (LSTM)                   [(None, None, 64), ( 23552       input_3[0][0]                    
__________________________________________________________________________________________________
lstm_6 (LSTM)                   [(None, None, 64), ( 33024       lstm_5[0][0]                     
                                                                 lstm_5[0][1]                     
                                                                 lstm_5[0][2]                     
_______________________________________________________________________________________

In [21]:
encoder_inputs = model.input[0]  # input_1
encoder_outputs, state_h_enc, state_c_enc = model.layers[4].output  # lstm_1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = Model(encoder_inputs, encoder_states)

decoder_inputs = model.input[1]  # input_2
decoder_state_input_h = Input(shape=(hidden_layer_size,), name="input_3")
decoder_state_input_c = Input(shape=(hidden_layer_size,), name="input_5")
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm_1 = model.layers[5]
decoder_lstm_2 = model.layers[6]
decoder_lstm_3 = model.layers[7]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm_3(decoder_lstm_2(decoder_lstm_1(decoder_inputs, initial_state=decoder_states_inputs)))
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = model.layers[8]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict((i, char) for char, i in input_char_index.items())
reverse_target_char_index = dict((i, char) for char, i in output_char_index.items())

In [24]:
encoder_model.summary()

Model: "functional_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, None, 27)]   0                                            
__________________________________________________________________________________________________
lstm_5 (LSTM)                   [(None, None, 64), ( 23552       input_3[0][0]                    
__________________________________________________________________________________________________
lstm_6 (LSTM)                   [(None, None, 64), ( 33024       lstm_5[0][0]                     
                                                                 lstm_5[0][1]                     
                                                                 lstm_5[0][2]                     
_______________________________________________________________________________________

In [25]:
decoder_model.summary()

Model: "functional_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, None, 66)]   0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 64)]         0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 64)]         0                                            
__________________________________________________________________________________________________
lstm_8 (LSTM)                   [(None, None, 64), ( 33536       input_4[0][0]                    
                                                                 input_3[0][0]        

In [22]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, output_char_index["\t"]] = 1.0

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if sampled_char == "\n" or len(decoded_sentence) > max_decoder_seq_length:
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.0

        # Update states
        states_value = [h, c]
    return decoded_sentence

In [23]:
for seq_index in range(20):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print("-")
    print("Input sentence:", train_X[seq_index])
    print("Decoded sentence:", decoded_sentence)


-
Input sentence: an
Decoded sentence: ्ंभ््  ोोट्यांघरांभ्  
-
Input sentence: ankganit
Decoded sentence: ्ंत् ांत्र

-
Input sentence: uncle
Decoded sentence: ्ंभ््

-
Input sentence: ankur
Decoded sentence: ्डग्राांत् म्ओभ्ांत्  
-
Input sentence: ankuran
Decoded sentence: ्लग्रा

-
Input sentence: ankurit
Decoded sentence: ्लग्रा

-
Input sentence: aankush
Decoded sentence: ्एभ्एमाणरांभ् माोगराीग
-
Input sentence: ankush
Decoded sentence: ्ंत्  ोंत्आ

-
Input sentence: ang
Decoded sentence: ्ंभ््

-
Input sentence: anga
Decoded sentence: ्ंभ््

-
Input sentence: agandh
Decoded sentence: ्डल्र

-
Input sentence: angad
Decoded sentence: ्डग्राोंत् ोंत्ी

-
Input sentence: angane
Decoded sentence: ्ंभ््

-
Input sentence: angbhang
Decoded sentence: ीगेत्

-
Input sentence: angarakshak
Decoded sentence: ीगृथाोगराोगधाांभराीगरा
-
Input sentence: angrakshak
Decoded sentence: ीग्ठा्लराोगराीगध्ा

-
Input sentence: angara
Decoded sentence: ्डग्राांध्ा

-
Input sentence: angaare
Decoded sente