In [1]:
import os

root_logdir = os.path.join(os.curdir, "my_logs")


def get_run_logdir():
    import time

    run_id = time.strftime("run_%Y_%m_%d-%H_%M_%S")
    return os.path.join(root_logdir, run_id)


run_logdir = get_run_logdir()

import tensorflow.keras as keras

tensorboard_cb = keras.callbacks.TensorBoard(run_logdir)

In [2]:
# https://stackabuse.com/python-for-nlp-neural-machine-translation-with-seq2seq-in-keras/
# https://web.stanford.edu/class/archive/cs/cs224n/cs224n.1194/reports/custom/15844304.pdf

In [3]:
# !pip install tensorflow_text

In [4]:
import pandas as pd
import warnings


warnings.filterwarnings("ignore")


class Train_Data:

    """
    Train_Data class Loads the data from multiple Training JSON files in Pandas Dataframes
    """

    def __init__(self, path, filenames):
        """
        Parameters
        ------------
        path: directory where English to SQL translation JSON are placed
        filenames: Json file names containing Train data
        """
        self.path = path
        self.filenames = filenames
        self.df_train = pd.DataFrame()
        for f in self.filenames:
            print("Reading file at path", self.path + f)
            try:
                df = pd.read_json(self.path + f)
                if len(self.df_train) == 0:
                    self.df_train = df
                else:
                    self.df_train = self.df_train.append(df)
                print("{} Rows in Total".format(len(self.df_train)))
            except Exception as e:
                print("Got error while Reading file : ", e)

    @property
    def questions(self):
        """
        Returns
        ------------
        Returns English Questions in Dataframe Rows as List
        """
        return self.df_train.question.values.tolist()

    @property
    def sql(self):
        """
        Returns
        ------------
        Returns SQL in Dataframe Rows as List
        """
        return self.df_train["query"].values.tolist()

    @property
    def question_tokens(self):
        """
        Returns
        ------------
        Returns English Question Tokens in Dataframe Rows as List
        """

        return self.df_train["question_toks"].values.tolist()

    @property
    def sql_tokens(self):
        """
        Returns
        ------------
        Returns SQL Query Tokens in Dataframe Rows as List
        """
        return self.df_train["query_toks"].values.tolist()

    def get_special_characters(self, list_of_text):
        """
        Parameters
        ------------
        list_of_text: Input List of Text
        Returns
        ------------
        Provides list of Special Characters in the text
        """
        return list(
            set(
                Preprocess.special_char("".join(["".join(ele) for ele in list_of_text]))
            )
        )

    def get_vocab_size(self, list_of_text):
        """
        Parameters
        ------------
        list_of_text: Input List of Text

        Returns
        ------------
        Vocabulary size or unique words in the corpus
        """
        word_list = []
        for sentence in list_of_text:
            for word in sentence.split():
                word = word.lower().strip()
                if word not in word_list:
                    word_list.append(word)
        return len(word_list), word_list


#################################
# CONSTANTS

EOS = "[END]"
SOS = "[START]"


#################################################################################################################

import re
import tensorflow as tf
import tensorflow_text as tf_text


class Preprocess:

    """
    Preprocess class cleans and standardize the data, add SOS-EOS tags to the data
    """

    def __init__(self, text):
        """
        Parameters
        ------------
        text : Input string
        Runs the text processing steps

        """
        self.processed_text = self.run_pipeline(text)

    def text_standardize(self, text):
        """
        Parameters
        ------------
        text : Input string

        Returns
        ------------
        -   Unicode normalization using NFKD method
        -   Lower Case text

        """
        text = tf_text.normalize_utf8(text, "NFKD")
        text = tf.strings.lower(text)
        return text

    def text_whitespace(self, text):
        """
        Parameters
        ------------
        text : Input string

        Returns
        ------------
        -   Remove $ and \\ special characters
        -   Add space around punctations
        -   Remove spaces around sentences

        """
        text = tf.strings.regex_replace(text, "[$\\\\]", "")
        text = tf.strings.regex_replace(text, "[.?!,¿()*:@]", r" \0 ")
        text = tf.strings.strip(text)
        return text

    def add_SOS_EOS(self, text):
        """
        Parameters
        ------------
        text : Input string

        Returns
        ------------
        -   Add <SOS> and <EOS> tags to each sentence

        """
        text = tf.strings.join([SOS, text, EOS], separator=" ")
        return text

    @classmethod
    def special_char(cls, text):
        """
        Parameters
        ------------
        text : Input string

        Returns
        ------------
        -   Special Characters found in Text using Regular Expression
        """
        return re.findall(r"[\W]", text.replace(" ", ""))

    def run_pipeline(self, text):
        """
        Parameters
        ------------
        text : Input string

        Returns
        ------------
        Executes series of Text pre processing functions

        """
        text = self.text_standardize(text)
        text = self.text_whitespace(text)
        text = self.add_SOS_EOS(text)
        self.text = text
        return self.text


class Features:
    """
    Extracts text features from data
    """

    def tf_lower_and_split_punct(self, text):
        """
        Parameters
        ------------
        text : Input string

        Returns
        ------------
        Standardized Text

        """
        return Preprocess(text).processed_text

    def vectorizor(self, document, max_vocab_size):
        """
        Parameters
        ------------
        document : Collection of sentences
        max_vocab_size : No of words in document used for TextVectorization

        Returns
        ------------
        TextVectorization object

        """
        text_processor = tf.keras.layers.TextVectorization(
            standardize=self.tf_lower_and_split_punct, max_tokens=max_vocab_size
        )
        text_processor.adapt(document)
        print("Sample Vocabulary", text_processor.get_vocabulary()[:10])
        return text_processor

In [5]:
try:
    del o
except:
    pass


o = Train_Data(
    "C:/Users/Lenovo/Downloads/pankhuri/CE888/assignment2/seq2seq/spider/",
    ["train_spider.json", "train_others.json"],
)

o.sql[0]

Reading file at path C:/Users/Lenovo/Downloads/pankhuri/CE888/assignment2/seq2seq/spider/train_spider.json
7000 Rows in Total
Reading file at path C:/Users/Lenovo/Downloads/pankhuri/CE888/assignment2/seq2seq/spider/train_others.json
8659 Rows in Total


'SELECT count(*) FROM head WHERE age  >  56'

In [7]:
max_input_vocab_size = o.get_vocab_size(o.questions)[0]
max_output_vocab_size = o.get_vocab_size(o.sql)[0]

input_text_processor = Features().vectorizor(o.questions, max_input_vocab_size)
output_text_processor = Features().vectorizor(o.sql, max_output_vocab_size)

Sample Vocabulary ['', '[UNK]', 'the', '[START]', '[END]', 'of', '?', '.', 'what', 'are']
Sample Vocabulary ['', '[UNK]', '.', 't1', 't2', '=', 'select', 'from', 'as', '[START]']


In [8]:
inp = o.questions
targ = o.sql

BUFFER_SIZE = len(inp)
BATCH_SIZE = 64

embedding_dim = 256
units = 1024

dataset = tf.data.Dataset.from_tensor_slices((inp, targ)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)

In [9]:
import numpy as np

embeddings_dictionary = dict()
emb = "C:/Users/Lenovo/Downloads/pankhuri/CE888/assignment2/embedding/glove.6B.100d.txt"
glove_file = open(emb, encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = np.asarray(records[1:], dtype="float32")
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

In [10]:
input_text_processor.get_vocabulary()[2]

'the'

In [11]:
embeddings_dictionary.get(input_text_processor.get_vocabulary()[2])

array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

In [15]:
num_words = max_input_vocab_size
EMBEDDING_SIZE = 100
embedding_matrix = np.zeros((num_words, EMBEDDING_SIZE))
for index, word in enumerate(input_text_processor.get_vocabulary()):
    # if word:
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [16]:
embedding_matrix[2]

array([-0.038194  , -0.24487001,  0.72812003, -0.39961001,  0.083172  ,
        0.043953  , -0.39140999,  0.3344    , -0.57545   ,  0.087459  ,
        0.28786999, -0.06731   ,  0.30906001, -0.26383999, -0.13231   ,
       -0.20757   ,  0.33395001, -0.33848   , -0.31742999, -0.48335999,
        0.1464    , -0.37303999,  0.34577   ,  0.052041  ,  0.44946   ,
       -0.46970999,  0.02628   , -0.54154998, -0.15518001, -0.14106999,
       -0.039722  ,  0.28277001,  0.14393   ,  0.23464   , -0.31020999,
        0.086173  ,  0.20397   ,  0.52623999,  0.17163999, -0.082378  ,
       -0.71787   , -0.41531   ,  0.20334999, -0.12763   ,  0.41367   ,
        0.55186999,  0.57907999, -0.33476999, -0.36559001, -0.54856998,
       -0.062892  ,  0.26583999,  0.30204999,  0.99774998, -0.80480999,
       -3.0243001 ,  0.01254   , -0.36941999,  2.21670008,  0.72201002,
       -0.24978   ,  0.92136002,  0.034514  ,  0.46744999,  1.10790002,
       -0.19358   , -0.074575  ,  0.23353   , -0.052062  , -0.22

In [12]:
max_input_len = input_text_processor.vocabulary_size()

In [21]:
from tensorflow.keras.layers import Embedding, Input, LSTM

embedding_layer = Embedding(
    num_words, EMBEDDING_SIZE, weights=[embedding_matrix], input_length=max_input_len
)

In [24]:
LSTM_NODES = 256
encoder_inputs_placeholder = Input(shape=(input_text_processor.vocabulary_size(),))
x = embedding_layer(encoder_inputs_placeholder)

encoder = LSTM(LSTM_NODES, return_state=True)

encoder_outputs, h, c = encoder(x)
encoder_states = [h, c]

In [26]:
decoder_inputs_placeholder = Input(shape=(output_text_processor.vocabulary_size(),))

decoder_embedding = Embedding(
    , LSTM_NODES)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

decoder_lstm = LSTM(LSTM_NODES, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs_x, initial_state=encoder_states)

NameError: name 'num_words_output' is not defined

In [None]:
decoder_dense = Dense(num_words_output, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
model = Model([encoder_inputs_placeholder, decoder_inputs_placeholder], decoder_outputs)
model.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)

In [None]:
r = model.fit(
    dataset,
    decoder_targets_one_hot,
    epochs=EPOCHS,
)

In [31]:
# from random import randint
# import numpy as np
# from numpy import array
# from numpy import argmax
# from numpy import array_equal
# from tensorflow.keras.utils import to_categorical
# from keras.models import Model
# from keras.layers import Input
# from keras.layers import LSTM
# from keras.layers import Dense

# # generate a sequence of random integers
# def generate_sequence(length, n_unique):
# 	return [randint(1, n_unique-1) for _ in range(length)]

# # prepare data for the LSTM
# def get_dataset(n_in, n_out, cardinality, n_samples):
# 	X1, X2, y = list(), list(), list()
# 	for _ in range(n_samples):
# 		source = generate_sequence(n_in, cardinality)
#         # define padded target sequence
# 		target = source[:n_out]
# 		target.reverse()
#         # create padded input target sequence
# 		target_in = [0] + target[:-1]
#         # encode
# 		src_encoded = to_categorical([source], num_classes=cardinality)
# 		tar_encoded = to_categorical([target], num_classes=cardinality)
# 		tar2_encoded = to_categorical([target_in], num_classes=cardinality)
#         # store
# 		X1.append(src_encoded)
# 		X2.append(tar2_encoded)
# 		y.append(tar_encoded)
# 	X1 = np.squeeze(array(X1), axis=1)
# 	X2 = np.squeeze(array(X2), axis=1)
# 	y = np.squeeze(array(y), axis=1)
# 	return array(X1), array(X2), array(y)

# # returns train, inference_encoder and inference_decoder models
# def define_models(n_input, n_output, n_units):
# 	# define training encoder
# 	encoder_inputs = Input(shape=(None, n_input))
# 	encoder = LSTM(n_units, return_state=True)
# 	encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# 	encoder_states = [state_h, state_c]
# 	# define training decoder
# 	decoder_inputs = Input(shape=(None, n_output))
# 	decoder_lstm = LSTM(n_units, return_sequences=True, return_state=True)
# 	decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# 	decoder_dense = Dense(n_output, activation='softmax')
# 	decoder_outputs = decoder_dense(decoder_outputs)
# 	model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# 	# define inference encoder
# 	encoder_model = Model(encoder_inputs, encoder_states)
# 	# define inference decoder
# 	decoder_state_input_h = Input(shape=(n_units,))
# 	decoder_state_input_c = Input(shape=(n_units,))
# 	decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# 	decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
# 	decoder_states = [state_h, state_c]
# 	decoder_outputs = decoder_dense(decoder_outputs)
# 	decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)
# 	# return all models
# 	return model, encoder_model, decoder_model

# # generate target given source sequence
# def predict_sequence(infenc, infdec, source, n_steps, cardinality):
# 	# encode
# 	state = infenc.predict(source)
# 	# start of sequence input
# 	target_seq = array([0.0 for _ in range(cardinality)]).reshape(1, 1, cardinality)
# 	# collect predictions
# 	output = list()
# 	for t in range(n_steps):
# 		# predict next char
# 		yhat, h, c = infdec.predict([target_seq] + state)
# 		# store prediction
# 		output.append(yhat[0,0,:])
# 		# update state
# 		state = [h, c]
# 		# update target sequence
# 		target_seq = yhat
# 	return array(output)

# # decode a one hot encoded string
# def one_hot_decode(encoded_seq):
# 	return [argmax(vector) for vector in encoded_seq]

# # configure problem
# n_features = 50 + 1
# n_steps_in = 6
# n_steps_out = 3
# # define model
# train, infenc, infdec = define_models(n_features, n_features, 128)
# train.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# # generate training dataset
# X1, X2, y = get_dataset(n_steps_in, n_steps_out, n_features, 100000)
# print(X1.shape,X2.shape,y.shape)
# # train model
# train.fit([X1, X2], y, epochs=1,callbacks=[tensorboard_cb])
# # evaluate LSTM
# total, correct = 100, 0
# for _ in range(total):
# 	X1, X2, y = get_dataset(n_steps_in, n_steps_out, n_features, 1)
# 	target = predict_sequence(infenc, infdec, X1, n_steps_out, n_features)
# 	if array_equal(one_hot_decode(y[0]), one_hot_decode(target)):
# 		correct += 1
# print('Accuracy: %.2f%%' % (float(correct)/float(total)*100.0))
# # spot check some examples
# for _ in range(10):
# 	X1, X2, y = get_dataset(n_steps_in, n_steps_out, n_features, 1)
# 	target = predict_sequence(infenc, infdec, X1, n_steps_out, n_features)
# 	print('X=%s y=%s, yhat=%s' % (one_hot_decode(X1[0]), one_hot_decode(y[0]), one_hot_decode(target)))

(100000, 6, 51) (100000, 3, 51) (100000, 3, 51)
Accuracy: 99.00%
X=[47, 49, 5, 49, 38, 39] y=[5, 49, 47], yhat=[5, 49, 47]
X=[5, 20, 23, 21, 11, 39] y=[23, 20, 5], yhat=[23, 20, 5]
X=[50, 21, 46, 25, 29, 24] y=[46, 21, 50], yhat=[46, 21, 50]
X=[10, 17, 1, 5, 37, 10] y=[1, 17, 10], yhat=[1, 17, 10]
X=[31, 35, 34, 33, 12, 43] y=[34, 35, 31], yhat=[34, 35, 31]
X=[2, 43, 48, 43, 5, 18] y=[48, 43, 2], yhat=[43, 48, 2]
X=[17, 4, 2, 12, 31, 7] y=[2, 4, 17], yhat=[2, 4, 17]
X=[30, 40, 23, 36, 6, 14] y=[23, 40, 30], yhat=[23, 40, 30]
X=[34, 1, 23, 16, 18, 7] y=[23, 1, 34], yhat=[23, 1, 34]
X=[32, 23, 19, 45, 23, 37] y=[19, 23, 32], yhat=[19, 23, 32]
