In [1]:
import numpy as np
import random
from random import shuffle
import os
import gensim
import re
import pickle
import tensorflow as tf
import operator
import math
import sys
from copy import deepcopy
from collections import Counter
from tensorflow.python.ops import rnn, rnn_cell
from tensorflow.python.framework import ops
from tensorflow.models.rnn.translate import seq2seq_model
import string
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.functions import split, explode
from pyspark.sql import functions as func
from pyspark.sql.types import *
import re

In [2]:
# Reuse an already running SparkContext when this cell is executed again:
# constructing a second SparkContext inside the same kernel raises an error,
# which made the cell fail on every re-run.
sc = SparkContext.getOrCreate(SparkConf().setMaster('local[4]'))
sqlcontext = SQLContext(sc)

In [3]:
def replace_punct(line):
    """Put a single space in front of every ASCII punctuation mark and lowercase the text.

    Args:
        line (str) : raw text

    Returns:
        str : lowercased text in which each character of ``string.punctuation``
              is preceded by one extra space
    """
    spaced = {mark: ' ' + mark for mark in string.punctuation}
    return line.translate(str.maketrans(spaced)).lower()

In [4]:
def preproccessData(filename, data_dir='/media/ai2-rey/data_disk/data_sets/SQuAD/'):
    """Load a SQuAD-style JSON file and flatten it into aligned text lists.

    The nested ``data -> paragraphs -> qas -> answers`` structure is exploded so
    that every (context, question, answer text) combination becomes one row.
    Each text column is then normalised with ``replace_punct``.

    Args:
        filename (str) : name of the JSON file inside ``data_dir``
        data_dir (str) : directory that holds the data set files

    Returns:
        tuple : ``(context, question, answer)`` - three lists of strings of
                equal length, where index ``i`` of each list belongs to the
                same row
    """
    df = sqlcontext.read.json(os.path.join(data_dir, filename))
    df = df.withColumn('data', explode('data'))
    df = df.select('data.paragraphs')
    df = df.withColumn('paragraphs', explode('paragraphs'))
    df = df.select(['paragraphs.context', 'paragraphs.qas'])
    df = df.withColumn('qas', explode('qas'))
    df = df.selectExpr(['context as context', 'qas.question as question', 'qas.answers.text as answer'])
    df = df.withColumn('answer', explode('answer'))

    # withColumn on an existing name replaces that column in place, so no
    # temporary "*_lower" columns, drops or renames are required.
    normalize_udf = func.udf(replace_punct, StringType())
    for column in ('context', 'question', 'answer'):
        df = df.withColumn(column, normalize_udf(df[column]))

    # printSchema() prints by itself and returns None; wrapping it in print()
    # only added a stray "None" line to the cell output.
    df.printSchema()

    # Collect once: a single Spark job evaluates the UDFs one time and the three
    # lists are guaranteed to come from the very same rows.
    rows = df.collect()
    context = [row['context'] for row in rows]
    question = [row['question'] for row in rows]
    answer = [row['answer'] for row in rows]

    return context, question, answer

In [10]:
class QA_Dataset:
    """Vocabulary plus fixed-length id encoding for (context, question, answer) text.

    The vocabulary is built from the training split only. Both splits are
    shuffled, split on whitespace, mapped to integer ids (``PAD`` = 0,
    ``UNK`` = 1, then vocabulary words) and padded/truncated to one length per
    field.

    Args:
        train_context, train_question, train_answer (list<str>) : aligned training text
        test_context, test_question, test_answer (list<str>) : aligned test text
        max_vocab_size (int) : keep only the most frequent words; ``None`` keeps all
        autopad_context, autopad_ques, autopad_answer (str) : ``'min'``, ``'max'``
            or ``'avg'`` - which training-set length statistic to use as pad
            length when no explicit length is given; ``None`` falls back to ``'avg'``
        context_pad_len, ques_pad_len, answer_pad_len (int) : explicit pad
            lengths; when given they take precedence over the autopad mode
        seed (int) : optional seed for a reproducible shuffle; ``None`` uses the
            global ``random`` state

    Raises:
        ValueError : on an unknown autopad mode or misaligned input lists
    """

    _AUTOPAD_MODES = (None, 'min', 'max', 'avg')
    _MISC_TOKENS = ('PAD', 'UNK')

    def __init__(self, train_context, train_question, train_answer,
                 test_context, test_question, test_answer, max_vocab_size=None,
                 autopad_context=None, autopad_ques=None, autopad_answer=None,
                 context_pad_len=None, ques_pad_len=None, answer_pad_len=None,
                 seed=None):

        # None is the documented default and must be accepted: the previous
        # asserts rejected it, which made the defaults (and explicit pad lengths
        # without a dummy autopad mode) unusable.
        for arg_name, mode in (('autopad_context', autopad_context),
                               ('autopad_ques', autopad_ques),
                               ('autopad_answer', autopad_answer)):
            if mode not in self._AUTOPAD_MODES:
                raise ValueError("{} must be 'min', 'max', 'avg' or None, got {!r}".format(arg_name, mode))

        if not (len(train_context) == len(train_question) == len(train_answer)):
            raise ValueError('train_context, train_question and train_answer must have the same length')
        if not (len(test_context) == len(test_question) == len(test_answer)):
            raise ValueError('test_context, test_question and test_answer must have the same length')

        self.train_context = train_context
        self.train_question = train_question
        self.train_answer = train_answer

        self.test_context = test_context
        self.test_question = test_question
        self.test_answer = test_answer

        self.max_vocab_size = max_vocab_size
        self.context_pad_len = context_pad_len
        self.ques_pad_len = ques_pad_len
        self.answer_pad_len = answer_pad_len
        self.seed = seed
        self.__autopad_context = autopad_context
        self.__autopad_ques = autopad_ques
        self.__autopad_answer = autopad_answer

        self.vocab = set()
        self.word_counter = dict()
        self.word2id_dict = dict()
        self.id2word_dict = dict()

        self.train_context_ids = None
        self.train_question_ids = None
        self.train_answer_ids = None
        self.test_context_ids = None
        self.test_question_ids = None
        self.test_answer_ids = None

        self.train_context_text = None
        self.train_question_text = None
        self.train_answer_text = None
        self.test_context_text = None
        self.test_question_text = None
        self.test_answer_text = None

        self.num_tokens = None

        self.__parse_data()
        self.__create_word2id_dict()
        self.__numericize_data()

    def __update_word_counter(self, sequence):
        """ Update word_counter with counts for words in a sentence

        Args:
            sequence (list<str>) : list of words in a sequence

        """
        for word in sequence:
            self.word_counter[word] = self.word_counter.get(word, 0) + 1

    def __create_vocab(self):
        """ Create set of most frequent unique words found in the training data """
        if self.max_vocab_size is None:
            self.vocab = set(self.word_counter.keys())
        else:
            by_frequency = sorted(self.word_counter, key=self.word_counter.get, reverse=True)
            self.vocab = set(by_frequency[:self.max_vocab_size])

    def __shuffle_data(self, data, rng=random):
        """ Shuffle a list of (context, question, answer) triples in place and unzip it

        Args:
            data (list<tuple>) : aligned triples
            rng : object providing ``shuffle``; defaults to the ``random`` module

        Returns:
            list<tuple> : ``[contexts, questions, answers]``

        """
        rng.shuffle(data)
        if not data:
            # zip(*[]) yields nothing, which cannot be unpacked into three fields
            return [(), (), ()]
        return list(zip(*data))

    def __parse_data(self):
        """ Count training words, build the vocabulary and shuffle both splits """
        for context, question, answer in zip(self.train_context, self.train_question, self.train_answer):
            self.__update_word_counter(context.split())
            self.__update_word_counter(question.split())
            self.__update_word_counter(answer.split())

        self.__create_vocab()

        rng = random if self.seed is None else random.Random(self.seed)

        train_shuffle = list(zip(self.train_context, self.train_question, self.train_answer))
        self.train_context, self.train_question, self.train_answer = self.__shuffle_data(train_shuffle, rng)

        test_shuffle = list(zip(self.test_context, self.test_question, self.test_answer))
        self.test_context, self.test_question, self.test_answer = self.__shuffle_data(test_shuffle, rng)

    def __create_word2id_dict(self):
        """ Assign ids: special tokens first, then words by descending frequency

        Iterating the vocabulary set directly would hand out ids in hash order,
        which differs between interpreter runs and would make ids (and therefore
        saved model weights) irreproducible. Sorting gives a stable mapping.
        """
        for token in self._MISC_TOKENS:
            self.word2id_dict[token] = len(self.word2id_dict)

        ordered_words = sorted(self.vocab, key=lambda w: (-self.word_counter.get(w, 0), w))
        for word in ordered_words:
            # keep the reserved ids if the corpus itself contains 'PAD'/'UNK'
            if word not in self.word2id_dict:
                self.word2id_dict[word] = len(self.word2id_dict)

        self.vocab |= set(self._MISC_TOKENS)
        self.id2word_dict = {word_id: word for word, word_id in self.word2id_dict.items()}
        self.num_tokens = len(self.word2id_dict)

    def __convert_word2id(self, word):
        """ Return the id of ``word``, or the ``UNK`` id for out-of-vocabulary words """
        return self.word2id_dict.get(word, self.word2id_dict['UNK'])

    def __convert_to_one_hot(self, answer_id):
        """ Return a ``num_tokens`` long list that is 1 at ``answer_id`` and 0 elsewhere """
        one_hot = [0] * self.num_tokens
        one_hot[answer_id] = 1
        return one_hot

    def __apply_padding(self, sequence, pad_len):
        """ Return a copy of ``sequence`` that is exactly ``pad_len`` ids long

        Shorter sequences are right-padded with the ``PAD`` id, longer ones are
        truncated. The input list is never modified.
        """
        missing = pad_len - len(sequence)
        if missing > 0:
            return sequence + [self.word2id_dict['PAD']] * missing
        return sequence[:pad_len]

    def __get_seq_length_stats(self, sequences):
        """ Return ``(min, max, avg)`` sequence length; ``(0, 0, 0)`` for no sequences

        The average is truncated to an int.
        """
        lengths = [len(sequence) for sequence in sequences]
        if not lengths:
            return 0, 0, 0
        return min(lengths), max(lengths), int(float(sum(lengths)) / len(lengths))

    def __resolve_pad_len(self, explicit_len, mode, stats):
        """ Pick a pad length: explicit value first, else the requested statistic (default avg) """
        if explicit_len is not None:
            return explicit_len
        min_len, max_len, avg_len = stats
        return {'min': min_len, 'max': max_len}.get(mode, avg_len)

    def __get_max_sequence_lengths(self, context_ids, question_ids, answer_ids):
        """ Compute length statistics, fix the three pad lengths and print a summary """
        context_stats = self.__get_seq_length_stats(context_ids)
        ques_stats = self.__get_seq_length_stats(question_ids)
        answer_stats = self.__get_seq_length_stats(answer_ids)

        self.context_pad_len = self.__resolve_pad_len(self.context_pad_len, self.__autopad_context, context_stats)
        self.ques_pad_len = self.__resolve_pad_len(self.ques_pad_len, self.__autopad_ques, ques_stats)
        self.answer_pad_len = self.__resolve_pad_len(self.answer_pad_len, self.__autopad_answer, answer_stats)

        print('Context length stats: min = {}, max = {}, avg = {}'.format(*context_stats))
        print('Context pad length set to: {}'.format(self.context_pad_len))

        print('Question length stats: min = {}, max = {}, avg = {}'.format(*ques_stats))
        print('Question pad length set to: {}'.format(self.ques_pad_len))

        print('Answer length stats: min = {}, max = {}, avg = {}'.format(*answer_stats))
        print('Answer pad length set to: {}'.format(self.answer_pad_len))

    def __tokenize_sentences(self, context_ids, question_ids, answer_ids):
        """ Pad/truncate every id sequence to the pad length of its field

        Args:
            context_ids, question_ids, answer_ids (list<list<int>>) : aligned id sequences

        Returns:
            tuple : the three lists with every sequence at its field's pad length

        """
        context_tokens = [self.__apply_padding(ids, self.context_pad_len) for ids in context_ids]
        question_tokens = [self.__apply_padding(ids, self.ques_pad_len) for ids in question_ids]
        answer_tokens = [self.__apply_padding(ids, self.answer_pad_len) for ids in answer_ids]
        return context_tokens, question_tokens, answer_tokens

    def __convert_text2ids(self, context, question, answer):
        """ Split each text on whitespace and map the words to ids

        Args:
            context, question, answer (sequence<str>) : aligned texts

        Returns:
            tuple : three lists of id lists (unpadded)

        """
        def to_ids(texts):
            return [[self.__convert_word2id(word) for word in text.split()] for text in texts]

        return to_ids(context), to_ids(question), to_ids(answer)

    def __convert_text2words(self, context, question, answer):
        """ Split each text on whitespace and return the three lists of word lists """
        context_words = [text.split() for text in context]
        question_words = [text.split() for text in question]
        answer_words = [text.split() for text in answer]
        return context_words, question_words, answer_words

    def __numericize_data(self):
        """ Fill the ``*_ids`` and ``*_text`` attributes for both splits

        Pad lengths are derived from the training split only and then applied
        to the test split as well.
        """
        train_c, train_q, train_a = self.__convert_text2ids(self.train_context, self.train_question, self.train_answer)
        test_c, test_q, test_a = self.__convert_text2ids(self.test_context, self.test_question, self.test_answer)

        self.__get_max_sequence_lengths(train_c, train_q, train_a)

        self.train_context_ids, self.train_question_ids, self.train_answer_ids = self.__tokenize_sentences(train_c, train_q, train_a)
        self.test_context_ids, self.test_question_ids, self.test_answer_ids = self.__tokenize_sentences(test_c, test_q, test_a)

        (self.train_context_text,
         self.train_question_text,
         self.train_answer_text) = self.__convert_text2words(self.train_context,
                                                             self.train_question,
                                                             self.train_answer)
        (self.test_context_text,
         self.test_question_text,
         self.test_answer_text) = self.__convert_text2words(self.test_context,
                                                            self.test_question,
                                                            self.test_answer)

In [28]:
class Seq2Seq:
    """Embedding sequence-to-sequence LSTM built with the ``tf.nn.seq2seq`` helpers.

    The graph is created once in the constructor, inside a private
    ``tf.Graph``; sessions are opened per call in ``train``, ``predict`` and
    ``test_try``.

    Args:
        vocab_size (int) : number of symbols for both encoder and decoder
        xseq_len (int) : number of encoder timesteps (one placeholder each)
        yseq_len (int) : number of decoder timesteps (one placeholder each)
        test_batches (int) : batches drawn per evaluation during training
        num_layers (int) : number of stacked LSTM layers
        lr_rate (float) : learning rate of the Adam optimizer
        momentum (float) : stored only; the graph does not use it
        n_hidden (int) : stored only; the LSTM cell size is ``word_dim``
        word_dim (int) : embedding size, also used as LSTM cell size
        dropout_rate (float) : fed as output keep probability while training
        gpu_device (int) : index used to build the ``/gpu:N`` device string
        model_dir (str) : directory in which a ``weights`` folder is kept;
            ``None`` uses the current working directory
    """

    def __init__(self, vocab_size, xseq_len, yseq_len, test_batches, num_layers, lr_rate=0.001,
                 momentum = 0.9, n_hidden=256, word_dim=100,
                 dropout_rate=1., gpu_device=0, model_dir=None):

        self.vocab_size = vocab_size
        self.xseq_len = xseq_len
        self.yseq_len = yseq_len
        self.test_batches = test_batches
        self.num_layers = num_layers
        self.lr_rate = lr_rate
        self.momentum = momentum
        self.n_hidden = n_hidden
        self.word_dim = word_dim
        self.dropout_rate = dropout_rate
        self.gpu_device = gpu_device
        self.model_dir = self.__prepare_model_dir(model_dir)
        self.__keep_prob = None

        self.__graph = tf.Graph()

        self.__build_model()

    def __prepare_model_dir(self, model_dir):
        """ Checks model directory for a weights folder and creates one if none exists

        Args:
            model_dir (str) : defines directory location to save weights and training log file

        Returns:
            str : directory location (with trailing '/') that contains a weights folder

        """
        if not model_dir:
            model_dir = os.getcwd() + '/'
        elif model_dir[-1] != '/':
            model_dir = model_dir + '/'

        os.makedirs(model_dir + 'weights', exist_ok=True)
        return model_dir

    def __build_model(self):
        """ Creates the computation graph: placeholders, stacked LSTM, a training
        decoder, a weight-sharing inference decoder (feed_previous=True), loss and
        optimizer. """

        tf.reset_default_graph()
        gpu_device_name = '/gpu:{}'.format(self.gpu_device)

        with self.__graph.as_default():
            with tf.device(gpu_device_name):
                # one int32 placeholder of shape [batch] per timestep
                self.enc_inp = [tf.placeholder(shape=[None,],
                                               dtype=tf.int32,
                                               name='ei_{}'.format(t)) for t in range(self.xseq_len)]
                # distinct name prefix: the labels previously reused 'ei_{}'
                self.labels = [tf.placeholder(shape=[None,],
                                              dtype=tf.int32,
                                              name='labels_{}'.format(t)) for t in range(self.yseq_len)]
                # decoder input = zero "GO" step followed by the labels shifted by one
                self.dec_inp = ([tf.zeros_like(self.enc_inp[0], dtype=np.int32, name="GO")] + self.labels[:-1])

                self.__keep_prob = tf.placeholder(tf.float32)

                # Build the RNN
                with tf.variable_scope("decoder"):
                    # All sizes come from the instance. The previous code read the
                    # notebook globals `word_dim` / `vocab_size`, so the constructor
                    # arguments were silently ignored.
                    cell = tf.nn.rnn_cell.DropoutWrapper(
                           tf.nn.rnn_cell.BasicLSTMCell(self.word_dim, state_is_tuple=True),
                           output_keep_prob=self.__keep_prob)

                    stacked_lstm = tf.nn.rnn_cell.MultiRNNCell([cell]*self.num_layers, state_is_tuple=True)

                    self.dec_outputs, self.dec_states = tf.nn.seq2seq.embedding_rnn_seq2seq(
                                                        self.enc_inp, self.dec_inp, stacked_lstm,
                                                        self.vocab_size, self.vocab_size, self.word_dim)

                    # second decoder shares the variables created above
                    tf.get_variable_scope().reuse_variables()

                    self.dec_outputs_test, self.dec_states_test = tf.nn.seq2seq.embedding_rnn_seq2seq(
                                                        self.enc_inp, self.dec_inp, stacked_lstm,
                                                        self.vocab_size, self.vocab_size, self.word_dim,
                                                        feed_previous=True)

                # every target position gets weight 1
                loss_weights = [tf.ones_like(l, dtype=tf.float32) for l in self.labels]

                # NOTE(review): the 4th positional argument is kept as in the original
                # call; confirm against the installed TensorFlow version that
                # sequence_loss still takes a symbol count in that position.
                self.__loss = tf.nn.seq2seq.sequence_loss(self.dec_outputs, self.labels, loss_weights, self.vocab_size)

                self.__optimizer = tf.train.AdamOptimizer(learning_rate=self.lr_rate).minimize(self.__loss)

                self.__init = tf.global_variables_initializer()

    def get_feed(self,X,Y, keep_prob):
        """ Build a feed dict from per-timestep inputs

        Args:
            X : indexable by timestep; ``X[t]`` feeds encoder placeholder ``t``
            Y : indexable by timestep; ``Y[t]`` feeds label placeholder ``t``
            keep_prob (float) : dropout keep probability

        Returns:
            dict : placeholder -> value

        """
        feed_dict = {self.enc_inp[t]: X[t] for t in range(self.xseq_len)}
        feed_dict.update({self.labels[t]: Y[t] for t in range(self.yseq_len)})
        feed_dict[self.__keep_prob] = keep_prob
        return feed_dict

    def train_batch(self,sess, data_iter):
        """ Run one optimisation step on the next batch and return its loss

        ``data_iter.next_batch()`` must return ``(X, Y)`` in the per-timestep
        layout that ``get_feed`` expects.
        """
        X,Y = data_iter.next_batch()
        feed_dict = self.get_feed(X,Y, self.dropout_rate)
        _, loss_v = sess.run([self.__optimizer, self.__loss], feed_dict)
        return loss_v

    def train(self, train_data_iter, test_data_iter, deploy_data_iter,
              deploy_interval = 1000, train_iters=10000, display_step=200,
              save_weights_interval=5000, id2word_dict=None, weights_prefix=None):
        """ Train the model, periodically logging, checkpointing and evaluating

        Args:
            train_data_iter : iterator providing training batches via ``next_batch()``
            test_data_iter : iterator providing evaluation batches via ``next_batch()``
            deploy_data_iter : accepted for compatibility; currently unused
            deploy_interval (int) : accepted for compatibility; currently unused
            train_iters (int) : number of optimisation steps
            display_step (int) : print the minibatch loss every this many steps
            save_weights_interval (int) : save a checkpoint and evaluate on
                ``self.test_batches`` batches every this many steps
            id2word_dict (dict) : accepted for compatibility; currently unused
            weights_prefix (str) : optional checkpoint file name prefix

        Returns:
            None : the session is closed when training ends

        """
        with tf.Session(graph=self.__graph, config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            sess.run(self.__init)
            saver = tf.train.Saver(max_to_keep=100)
            # iterations are numbered from 1 so the modulo checks fire on multiples
            for train_iter in range(1, train_iters + 1):
                loss = self.train_batch(sess, train_data_iter)
                if train_iter % display_step == 0:
                    print("Iter {}, Minibatch Loss = {:.6f}".format(train_iter, loss))

                if train_iter % save_weights_interval == 0:
                    if weights_prefix is not None:
                        # extension fixed from the typo '.cpkt' to match the default name
                        weights_dir = self.model_dir + "weights/{}_iter-{}.ckpt".format(weights_prefix, train_iter)
                    else:
                        weights_dir = self.model_dir + "weights/QA_seq2seq_weights_iter-{}.ckpt".format(train_iter)
                    save_path = saver.save(sess, weights_dir)
                    print("Model saved in file: {}".format(save_path))
                    test_loss, test_accuracy = self.test(sess, test_data_iter, self.test_batches)
                    print('Test loss @ iter {}: {} '.format(train_iter, test_loss))
                    print('Test Accuracy @ iter {}: {} '.format(train_iter, test_accuracy))

    def __exact_matches(self, dec_op, Y):
        """ Per-sample exact-match flags

        Args:
            dec_op : decoder outputs indexed [sample, timestep, symbol]
            Y : targets indexed [timestep, sample]

        Returns:
            list<bool> : True where every predicted symbol equals its target

        """
        predictions = np.argmax(dec_op, axis=2)      # computed once per batch
        targets = np.asarray(Y).T
        return [all(targets[idx] == predictions[idx]) for idx in range(len(dec_op))]

    def test_step(self, test_data_iter, sess):
        """ Evaluate one batch with the inference decoder

        Returns:
            tuple : ``(loss, decoder_outputs, X, Y)`` with the decoder outputs
                    transposed to [sample, timestep, symbol]

        """
        testX, testY = test_data_iter.next_batch()
        feed_dict = self.get_feed(testX, testY, keep_prob=1.)
        loss_v, dec_op_v = sess.run([self.__loss, self.dec_outputs_test], feed_dict)
        dec_op_v = np.array(dec_op_v).transpose([1,0,2])
        return loss_v, dec_op_v, testX, testY

    def test(self, sess, test_data_iter, num_batches):
        """ Mean loss and exact-match accuracy over ``num_batches`` batches

        Returns:
            tuple : ``(mean_loss, accuracy)`` where accuracy is the fraction of
                    samples whose whole predicted sequence equals the target

        """
        losses = []
        matches = []
        for _ in range(num_batches):
            loss_t, dec_op_t, batchX, batchY = self.test_step(test_data_iter, sess)
            losses.append(loss_t)
            matches.extend(self.__exact_matches(dec_op_t, batchY))
        return np.mean(losses), np.mean(matches)

    def predict(self, ckpt_file, X):
        """ Restore a checkpoint and decode ``X``

        Args:
            ckpt_file (str) : checkpoint name inside ``model_dir/weights/``
            X : encoder input indexable by timestep

        Returns:
            ndarray : highest-scoring symbol index per [sample, timestep]

        """
        with tf.Session(graph=self.__graph, config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            sess.run(self.__init)
            saver = tf.train.Saver()
            saver.restore(sess, self.model_dir + 'weights/' + ckpt_file)
            feed_dict = {self.enc_inp[t]: X[t] for t in range(self.xseq_len)}
            feed_dict[self.__keep_prob] = 1.
            dec_op_v = sess.run(self.dec_outputs_test, feed_dict)
            # dec_op_v is a list over timesteps; swap the first two axes so the
            # result is indexed [sample, timestep, symbol]
            dec_op_v = np.array(dec_op_v).transpose([1,0,2])
            # return the index of item with highest score
            return np.argmax(dec_op_v, axis=2)

    def test_try(self, ckpt_file, test_data_iter):
        """ Restore a checkpoint and return exact-match accuracy on one test batch """
        with tf.Session(graph=self.__graph, config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            sess.run(self.__init)
            saver = tf.train.Saver()
            saver.restore(sess, self.model_dir + 'weights/' + ckpt_file)
            loss_t, out_t, X, Y = self.test_step(test_data_iter, sess)
            return np.mean(self.__exact_matches(out_t, Y))

In [6]:
class DataIterator:
    """Endless supplier of random mini-batches drawn from an indexable data set.

    Every pass over the data uses a fresh random permutation. Each pass yields
    ``len(data) // batch_size`` batches of exactly ``batch_size`` samples; the
    remainder of a pass is skipped. If the data set holds fewer samples than
    one batch, all samples are served as a single shorter batch.

    Args:
        data : indexable collection supporting ``len``
        batch_size (int) : number of samples per batch (>= 1)

    Raises:
        ValueError : if ``batch_size`` is smaller than 1
    """

    def __init__(self, data, batch_size):
        if batch_size < 1:
            raise ValueError('batch_size must be at least 1, got {!r}'.format(batch_size))
        self.data = data
        self.batch_size = batch_size
        self.data_iterator = self.make_random_iter()

    def next_batch(self):
        """ Return the next batch, starting a new shuffled pass when needed

        Returns:
            tuple : ``(batch, batch_idxs)`` - the selected samples and the
                    positions in ``data`` they were taken from

        Raises:
            ValueError : if the data set is empty

        """
        idxs = next(self.data_iterator, None)
        if idxs is None:
            # current pass exhausted: reshuffle and start over
            self.data_iterator = self.make_random_iter()
            idxs = next(self.data_iterator, None)
            if idxs is None:
                raise ValueError('cannot draw a batch from an empty data set')

        batch = [self.data[i] for i in idxs]
        batch_idxs = list(idxs)
        return batch, batch_idxs

    def make_random_iter(self):
        """ Return an iterator over index arrays for one shuffled pass

        The previous implementation always discarded the last chunk of the
        split, which threw away a complete batch whenever ``len(data)`` was a
        multiple of ``batch_size`` and produced no batch at all when
        ``len(data) <= batch_size``. Only a genuinely short trailing chunk is
        dropped now.
        """
        n_samples = len(self.data)
        order = np.random.permutation(n_samples)
        n_full = n_samples // self.batch_size
        if n_full == 0:
            return iter([order] if n_samples else [])
        return iter(np.split(order[:n_full * self.batch_size], n_full))

In [7]:
# Flatten the train and dev JSON files into aligned, normalised
# context / question / answer string lists (one entry per answer text).
train_context, train_question, train_answer = preproccessData('train-v1.1.json')
test_context, test_question, test_answer = preproccessData('dev-v1.1.json')

root
 |-- context: string (nullable = true)
 |-- question: string (nullable = true)
 |-- answer: string (nullable = true)

None
root
 |-- context: string (nullable = true)
 |-- question: string (nullable = true)
 |-- answer: string (nullable = true)

None


In [11]:
# Build the vocabulary from the training split and encode both splits as
# fixed-length id sequences. Contexts and questions are padded to the longest
# training sequence; answers use the average training length, so longer
# answers are truncated.
data = QA_Dataset(train_context, train_question, train_answer,
                 test_context, test_question, test_answer,
                 autopad_context='max', autopad_answer='avg', autopad_ques='max')

Context length stats: min = 22, max = 766, avg = 138
Context pad length set to: 766
Question length stats: min = 1, max = 60, avg = 11
Question pad length set to: 60
Answer length stats: min = 1, max = 46, avg = 3
Answer pad length set to: 3


In [12]:
# One (context_ids, question_ids, answer_ids) tuple per sample.
train_data = list(zip(data.train_context_ids, data.train_question_ids, data.train_answer_ids))
test_data = list(zip(data.test_context_ids, data.test_question_ids, data.test_answer_ids))

In [None]:
# Random mini-batch iterators over the zipped id tuples.
# NOTE(review): DataIterator.next_batch() returns (samples, indices), while
# Seq2Seq.train_batch / test_step unpack the result as (X, Y) and index both
# by timestep. An adapter that yields per-timestep X / Y arrays appears to be
# required before these iterators can drive the model - confirm.
train_data_iter = DataIterator(train_data, 256)
test_data_iter = DataIterator(test_data, 358)
deploy_data_iter = DataIterator(test_data, 1)

In [None]:
# Samples per batch if the test set is divided into 97 batches
# (presumably how the test batch size above was chosen - confirm).
len(test_data)/97

In [None]:
# Model hyper-parameters.
# data.vocab includes the PAD and UNK tokens.
vocab_size = len(data.vocab) 

# encoder length = padded context length, decoder length = padded answer length
xseq_len = data.context_pad_len
yseq_len = data.answer_pad_len
# number of test batches evaluated at every checkpoint
test_batches = 97 
num_layers = 3
lr_rate = 0.001
# NOTE(review): Seq2Seq stores momentum and n_hidden but its graph does not use them
momentum = 0.9
n_hidden = 256 
word_dim = 100
# fed to the model as dropout keep probability during training
dropout_rate = 0.5
gpu_device = 0
# None -> Seq2Seq keeps its weights folder in the current working directory
model_dir = None

In [None]:
# Build the model from the hyper-parameters defined in the configuration cell.
# Every configured value is passed explicitly (previously lr_rate, momentum,
# n_hidden and model_dir were defined but never handed over, dropout was
# hard-coded, and a missing comma made the call a syntax error).
model = Seq2Seq(vocab_size=vocab_size,
                xseq_len=xseq_len,
                yseq_len=yseq_len,
                test_batches=test_batches,
                num_layers=num_layers,
                lr_rate=lr_rate,
                momentum=momentum,
                n_hidden=n_hidden,
                word_dim=word_dim,
                dropout_rate=dropout_rate,
                gpu_device=gpu_device,
                model_dir=model_dir)

In [None]:
# Train for 1000 iterations; the loss is printed, a checkpoint is saved and the
# test split is evaluated every 200 iterations.
# NOTE(review): Seq2Seq.train has no return statement, so `sess` is bound to None.
sess = model.train(train_data_iter, test_data_iter, deploy_data_iter,
                  train_iters=1000, display_step=200,
                  save_weights_interval = 200, id2word_dict=data.id2word_dict, 
                  weights_prefix='SQuAD_QA')

In [25]:
# Vocabulary size, including the PAD and UNK tokens.
len(data.vocab)

105183

In [26]:
# Number of tokens that received an id; expected to equal len(data.vocab).
len(data.word2id_dict)

105183

In [27]:
# Distinct words counted in the training text (PAD / UNK are not counted here).
len(data.word_counter)

105181