In [1]:
import numpy as np
import tensorflow as tf
import json 
import re
import math
import os 
import datetime 
import random
import sys

In [2]:
#util
def shuffle_in_unison(a, b):
    rng_state = np.random.get_state()
    np.random.shuffle(a)
    np.random.set_state(rng_state)
    np.random.shuffle(b)

def one_hot(idx, size):
    v = np.zeros(size)
    v[idx] = 1
    return v

def rand_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def weight_var(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

In [3]:
#helpers
def tokenize(text):
    ignore_chars = r"[.,'!?_-]"
    text = re.sub(ignore_chars, " ", text)
    words = text.lower().split()
    return [w for w in words if w != '']

def print_distribution(numbers):
    n = sorted(numbers)
    return ("{} - {}, with median {} and 90th-percentile {}".format(n[0], n[-1], n[int(len(n)/2)], n[int(len(n)*0.9)]))

assert tokenize('abc def') == ['abc', 'def']
assert tokenize('abc-def.ghi') == ['abc', 'def', 'ghi']
assert tokenize('A  B  ') == ['a', 'b']

In [4]:
#net
class Net(object):
    def __init__(self, dir_path=None, **kwargs):
        
        for k, v in kwargs.iteritems():
            setattr(self, k, v)
        
        self.setup()
        self.session = tf.Session()
        self.session.run(tf.initialize_all_variables())
        
        self.was_restored = False
        if dir_path:
            if dir_path[-1] != '/': dir_path += '/'
            if not os.path.exists(dir_path):
                os.mkdir(dir_path)
            self.dir_path = dir_path
            
            self.saver = tf.train.Saver()
            ckpt = tf.train.get_checkpoint_state(self.dir_path)
            if ckpt and ckpt.model_checkpoint_path:
                self.saver.restore(self.session, ckpt.model_checkpoint_path)
                self.was_restored = True
                print("Restored model from checkpoint {0}".format(ckpt.model_checkpoint_path))
        else:
            self.saver = None
        
    def setup(self):
        raise NotImplementedError("Subclasses should implement setup()")
        
    def train(self, inputs, outputs):
        raise NotImplementedError("Subclasses should implement train()")
    
    def predict(self, inputs):
        raise NotImplementedError("Subclasses should implement predict()")
    
    def save(self, step):
        if self.saver:
            self.saver.save(self.session, self.dir_path + 'model.ckpt', global_step=step)
    
    def evaluate(self, inputs, outputs):
        raise NotImplementedError("Subclasses should implement evaluate()")
    
    def training_loop(self, training_batch_generator, testing_batch_generator, evaluation_interval=10):
        step = 0
        for step, (inp, out) in enumerate(training_batch_generator):
            self.train(inp, out)
            if step % evaluation_interval == 0:
                inp, out = next(testing_batch_generator)
                print(self.evaluate(inp, out))
                self.save(step)
        print(self.evaluate(inp, out))
        self.save(step)

def random_batch(inputs, outputs, count=100):
    indices = np.random.randint(0, len(inputs)-1, count)
    return inputs.take(indices, axis=0), outputs.take(indices, axis=0)

def batch_generator(inputs, outputs, size=100, batches=None, epochs=None, random=False, print_progress=False):
    if epochs is not None:
        batches = int(math.ceil(len(inputs) * 1.0 / size))
    
    last_printed = datetime.datetime.now()
    
    step = 0
    while True:
        if random:
            indices = np.random.randint(0, len(inputs)-1, size)
            yield inputs.take(indices, axis=0), outputs.take(indices, axis=0)
        else:
            start_index = step * size % len(inputs)
            end_index = min(start_index + size, len(inputs))
            yield inputs[start_index:end_index], outputs[start_index:end_index]
        step += 1
        if print_progress and batches and (datetime.datetime.now() - last_printed).total_seconds() > 4:
            last_printed = datetime.datetime.now()
            print("{0}%".format(step * 100.0 / batches))
        if batches is not None and step >= batches:
            break

In [5]:
#dataset
data_dir = '/media/ai2-rey/data_disk/data_sets/SQuAD/'
class Dataset(object):
    def __init__(self, path):
        data = json.load(open(path))
        self.paragraphs = [Paragraph(p) for article in data['data'] for p in article['paragraphs']]

class Paragraph(object):
    def __init__(self, data):
        self.passage = data['context']
        self.qas = [QA(qa) for qa in data['qas']]

class QA(object):
    def __init__(self, data):
        self.question = data['question']
        self.answers = data['answers'] # array of dictionaries, with keys `answer_start` (a character index) and `text`

train = Dataset(data_dir+'train-v1.1.json')
test = Dataset(data_dir+'dev-v1.1.json')
passage_length_distribution = [len(tokenize(p.passage)) for p in train.paragraphs]
question_length_distribution = [len(tokenize(q.question)) for para in train.paragraphs for q in para.qas]
print( "Passage length distribution (tokens):", print_distribution(passage_length_distribution))
print( "Question length distribution (tokens):", print_distribution(question_length_distribution))

Passage length distribution (tokens): 20 - 678, with median 110 and 90th-percentile 184
Question length distribution (tokens): 1 - 40, with median 10 and 90th-percentile 15


In [9]:
print(train.paragraphs[0].passage)
print(train.paragraphs[0].qas[0].question)

Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?


In [7]:
#embedding 
UNKNOWN = "*UNKNOWN*"

def generate_embedding_files(filename):
    embeddings = {}
    for line in open(filename):
        parts = line.split()
        embeddings[parts[0]] = list(map(float, parts[1:]))
    embedding_size = len(list(embeddings.values())[0])
    embeddings[UNKNOWN] = [0.0 for _ in range(embedding_size)]
    
    words = embeddings.keys()
    embedding_matrix = np.array([embeddings[word] for word in list(embeddings.keys())])
    return words, embedding_matrix

def embeddings():
    #words = json.load(open('cache/vocab.json'))
    #matrix = np.load(open('cache/embedding.npy'))
    return words, embedding_matrix

words, embedding_matrix = generate_embedding_files('/media/ai2-rey/data_disk/data_sets/glove.6B/glove.6B.50d.txt')

In [25]:
vocab, embedding_matrix = embeddings()
vocab_lookup = {word: i for i, word in enumerate(vocab)}
vocab_size, embedding_size = embedding_matrix.shape

passage_max_length = 200
question_max_length = 20

hidden_size = 50

def vectorize(text, fixed_length=None):
    vocab_size = len(vocab_lookup)
    tokens = tokenize(text)
    if fixed_length is not None:
        tokens = (tokens + [0] * max(0, fixed_length - len(tokens)))[:fixed_length]
    return np.array([vocab_lookup.get(token, vocab_lookup[UNKNOWN]) for token in tokens])

def output_mask(passage, answers):
    # returns a vector -- same length as the passage -- with 1 if the token is part of an answer, otherwise 0
    answer_marker = "$$answer$" # hack !! !!! !  !  yikes ! ! ! ! !
    for answer in answers:
        replacement = " ".join([answer_marker + w for w in tokenize(answer)])
        passage = passage.replace(answer, replacement)
    return np.array([(1 if token.startswith(answer_marker) else 0) for token in tokenize(passage)])

def make_fixed_length(a, length):
    return np.concatenate([a, [0] * (length - len(a))]) if len(a) < length else a[:length]

def vectors_from_question(para, qa):
    # return an (input, output) tuple, where input is a tuple (passage, question) and output is an output mask
    print(type(para.passage))
    print(qa.answers[0]['text'])
    passage = vectorize(para.passage, fixed_length=passage_max_length)
    # print para.passage
    question = vectorize(qa.question, fixed_length=question_max_length)
    # print qa.question
    # print [answer['text'] for answer in qa.answers]
    mask = output_mask(para.passage, [answer['text'] for answer in qa.answers])
    mask = make_fixed_length(mask, passage_max_length)
    return (passage, question), mask

In [35]:
for i in range(20):    
    print(train.paragraphs[i].qas[0].answers[0]['text'])

Saint Bernadette Soubirous
September 1876
Rome
eight
Learning Resource Center
1924
Joan B. Kroc Institute for International Peace Studies
14
3,577
18th overall
Father Joseph Carrier, C.S.C.
1882
The Lobund Institute
The Review of Politics
John Jenkins
8,448
80%
Congregatio a Sancta Cruce
Washington Hall
scholastic and classical


In [31]:
(a,b), c = vectors_from_question(train.paragraphs[1], train.paragraphs[12].qas[1])
print(a)
print(b)
print(c)

<class 'str'>
the 1940s
[ 92178 231801 271140 320117 194553  48384 251511 309502 246252 331526
 123312 349485  69676  64890  88846  41309  19814 185030  38690 331526
  41309  94653 167381 185710 108035 123312  35809  85041 139361  11166
  85041  41546 172559  85041 333475 164460  92178 123312  82442 286349
 239654 126155 379080 173234  19814 168850 330996 213770 161473 121309
 209216  85041   2971 297314 331247  19814 336198 193380 111238 299432
 126155  19814 388236 157727  19814 320117 330996  19814 360138 213770
  57092 121309 123312  44753  85041 193243 133170  38690 154637  85041
     99  19814 116177 175473 213770  51506  83922  19814 185710 288040
 352366 299432  23255 188624  19814 145139  51506 385142  85041 290277
  76140 366442  85041 320117  64890  85041 337697 294263 246252 123789
 108035  48384 251511  85041 202358  30049 309502 341100 297521 168850
  85041  19814 116177  19814 145139 213770 323215 152219 299432  85041
  18142 394130 288040 123312 120788 344390 210773 122

In [None]:
#SQuAD

def questions_from_dataset(ds):
    for para in ds.paragraphs:
        for qa in para.qas:
            yield (para, qa)

class Squad(Net):
    def setup(self):
        passage = tf.placeholder(tf.int32, [None, passage_max_length], name='passage')  # shape (batch_size, passage_max_length)
        question = tf.placeholder(tf.int32, [None, question_max_length], name='question')  # shape (batch_size, question_max_length)
        desired_output = tf.placeholder(tf.float32, [None, passage_max_length], name='desired_output')  # shape (batch_size, passage_max_length)
        
        embedding = tf.constant(embedding_matrix, name='embeddintf.nn

        #######################
        # Preprocessing layer #
        ####################### 

        passage_embedded = tf.nn.embedding_lookup(embedding, passage)  # shape (batch_size, passage_max_length, embedding_size)
        question_embedded = tf.nn.embedding_lookup(embedding, question)  # shape (batch_size, question_max_length, embedding_size)

        dropout = tf.placeholder(tf.float32)

        with tf.variable_scope('passage_lstm'):
            passage_cell = tf.nn.rnn_cell.LSTMCell(hidden_size)
            passage_cell = tf.nn.rnn_cell.DropoutWrapper(passage_cell, output_keep_prob=dropout)
            passage_cell = tf.nn.rnn_cell.MultiRNNCell([passage_cell] * 2)
            H_p, _ = tf.nn.dynamic_rnn(passage_cell, passage_embedded, dtype=tf.float32)  # shape (batch_size, passage_max_length, hidden_size)
        
        with tf.variable_scope('question_lstm'):
            question_cell = tf.nn.rnn_cell.LSTMCell(hidden_size)
            question_cell = tf.nn.rnn_cell.DropoutWrapper(question_cell, output_keep_prob=dropout)
            question_cell = tf.nn.rnn_cell.MultiRNNCell([question_cell] * 2)
            H_q, _ = tf.nn.dynamic_rnn(question_cell, question_embedded, dtype=tf.float32)  # shape (batch_size, question_max_length, hidden_size)


        ####################
        # Match-LSTM layer #
        ####################

        # Weights and bias to compute `G`
        W_q = self.weight_variable(shape=[hidden_size, hidden_size])
        W_p = self.weight_variable(shape=[hidden_size, hidden_size])
        W_r = self.weight_variable(shape=[hidden_size, hidden_size])
        b_p = self.bias_variable(shape=[hidden_size])

        # Weight and bias to compute `a`
        w = self.weight_variable(shape=[hidden_size])
        b_alpha = self.bias_variable(shape=[])   # In the paper, this is `b`

        # Only calculate `WH_q` once
        WH_q = tf.matmul(W_q, H_q)

        # Results for forward and backward LSTMs
        H_r_forward = []
        H_r_backward = []

        with tf.variable_scope('forward_match_lstm'):
            forward_cell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.LSTMCell(hidden_size), output_keep_prob=dropout)
            forward_state = forward_cell.zero_state(batch_size, dtype=tf.float32)
            h = forward_state.h
            for i in range(len(H_p)):
                G_forward = tf.tanh(WH_q + tf.tile((tf.matmul(W_p, H_p[i]) + tf.matmul(W_r, h) + b_p), [question_max_length, 1]))
                alpha_forward = tf.nn.softmax(w * G_forward + tf.tile(b_alpha, [question_max_length, 1]))

                z_forward = tf.concatenate(H_p[i], H_q * alpha_forward[i])
                h, forward_state = forward_cell(z_forward, forward_state)
                H_r_forward.append(h)

        with tf.variable_scope('backward_match_lstm'):
            backward_cell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.LSTMCell(hidden_size), output_keep_prob=dropout)
            backward_state = backward_cell.zero_state(batch_size, dtype=tf.float32)
            h = backward_state.h
            for i in reversed(range(len(H_p))):
                G_backward = tf.tanh(WH_q + tf.tile((tf.matmul(W_p, H_p[i]) + tf.matmul(W_r, h) + b_p), [question_max_length, 1]))
                alpha_backward = tf.nn.softmax(w * G_backward + tf.tile(b_alpha, [question_max_length, 1]))

                z_backward = tf.concatenate(H_p[i], H_q * alpha_backward[i])
                h, backward_state = backward_cell(z_backward, backward_state)
                H_r_backward.append(h)

        # After finding forward and backward `H_r[i]` for all `i`, concatenate `H_r_forward` and `H_r_backward`
        H_r = tf.concatenate(H_r_forward, H_r_backward)

        # TODO: Assert that the shape of `H_r` is (2 * hidden_size, passage_max_length)


        ########################
        # Answer-Pointer layer #
        ########################

        # TODO: Switch this over to boundary model or add zero vector padding at end of H_r
        #       ^ Might not be necessary ??

        # Weights and bias to compute `F`
        V = self.weight_variable(shape=[hidden_size, 2 * hidden_size])
        W_a = self.weight_variable(shape=[hidden_size, hidden_size])
        b_a = self.bias_variable(shape=[hidden_size])   # In the paper, this is `c`

        # Weight and bias to compute `beta`
        v = self.weight_variable(shape=[hidden_size])
        b_beta = self.bias_variable(shape=[])

        # Only calculate `VH` once
        VH = tf.matmul(V, H_r)        # shape (hidden_size, passage_max_length)

        H_a = []

        with tf.variable_scope('answer_pointer_lstm'):
            pointer_cell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.LSTMCell(hidden_size), output_keep_prob=dropout)
            pointer_state = pointer_cell.zero_state(batch_size, dtype=tf.float32)
            h = pointer_state.h
            for k in range(len(H_p)):
                F = tf.tanh(VH + tf.tile((tf.matmul(W_a, H_a[k]) + b_a), [passage_max_length, 1]))
                beta = tf.nn.softmax(v * F + tf.tile(b_beta, [passage_max_length, 1]))

                h, pointer_state = pointer_cell(tf.matmul(H_r, beta), pointer_state)
                H_a.append(h)
        

        # TODO: Replace the loss function below with the loss function from the paper
        loss = tf.reduce_mean(tf.reduce_sum(tf.pow(desired_output - output, 2), reduction_indices=[1]))
        train_step = tf.train.AdamOptimizer(0.001).minimize(loss)
        
        self.passage = passage
        self.question = question
        self.output = output
        self.desired_output = desired_output
        self.train_step = train_step
        self.loss = loss
        self.dropout = dropout
    
    def train(self, paragraph_question_pairs):
        vectors = [vectors_from_question(p, q) for p, q in paragraph_question_pairs]
        # print vectors[0]
        questions = np.array([question for ((passage, question), mask) in vectors])
        passages = np.array([passage for ((passage, question), mask) in vectors])
        masks = np.array([mask for ((passage, question), mask) in vectors])
        
        feed = {self.passage: passages, self.question: questions, self.desired_output: masks, self.dropout: 0.5}
        _, loss = self.session.run([self.train_step, self.loss], feed_dict=feed)
        print loss

    @staticmethod
    def weight_variable(shape, name=None):
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial, name=name)

    @staticmethod
    def bias_variable(shape, name=None):
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial, name=name)

def iterate_batches(list, size=10):
    i = 0
    while True:
        yield [list[i+j] for j in range(size)]
        i += size

n = Squad(dir_path='save/squad1')

def train():
    train_questions = list(questions_from_dataset(train))
    random.shuffle(train_questions)
    test_questions = list(questions_from_dataset(test))
    random.shuffle(test_questions)
    
    i = 0
    for i, batch in enumerate(iterate_batches(train_questions, size=20)):
        n.train(batch)
        if i % 10 == 0:
            n.save(i)



In [None]:
train()