In [1]:
import codecs
import tempfile

import numpy as np
import tensorflow as tf
import time
import math
import external_lib as el

In [2]:
#full_glove_path, = el.download_and_unzip(
#  'http://nlp.stanford.edu/data/', 'glove.840B.300d.zip',
#  'glove.840B.300d.txt', data_dir = "./data_sources/glove.6B/")

In [3]:
# Path of the GloVe subset that this notebook loads via el.load_embeddings.
# It is also the second argument of the commented-out el.filter_glove call in the next cell.
filtered_glove_path = 'data/sick_filtered_glove.txt'

In [4]:
# Location of the full GloVe vector file. In the cells visible here it is only
# referenced by the commented-out el.filter_glove call below.
full_glove_path = 'data_sources/glove.6B/glove.840B.300d.txt'
#el.filter_glove(full_glove_path, filtered_glove_path)

In [5]:
# Load the embedding matrix and the word -> id vocabulary from the filtered GloVe file.
# NOTE(review): the return types of el.load_embeddings are not visible here; later cells
# index word_to_idx by word string and pass embedding_matrix to tf.constant.
embedding_matrix, word_to_idx = el.load_embeddings(filtered_glove_path)

loading word embeddings from data/sick_filtered_glove.txt


In [6]:
"""Tree node class"""
class Node(object):
    """A node of a parsed sentence tree.

    Attributes:
        data: list of string tokens describing the node (joined with "|" when printed).
        children: child nodes, in the order they were added.
        parent: the parent node, or None when the node has no parent.
    """

    def __init__(self, data, parent=None):
        self.parent = parent
        self.children = []
        self.data = data

    def add_child(self, obj):
        """Append ``obj`` to this node's list of children."""
        self.children.append(obj)

    def add_parent(self, obj):
        """Make ``obj`` this node's parent, replacing any previous parent."""
        self.parent = obj

    def __str__(self, tabs=0):
        """Render this node and its subtree as text.

        The node's own line is indented by ``tabs`` spaces and ends with a
        newline; each child subtree is rendered with two more spaces of indent
        and the child renderings are joined with an extra newline.
        """
        indent = " " * tabs
        own_line = indent + "+-- Node: " + "|".join(self.data) + "\n"
        rendered_children = [child.__str__(tabs + 2) for child in self.children]
        return own_line + "\n".join(rendered_children)

In [7]:
"""Preparing inputs
Parse indented lines of text into a tree. Children are indented & under the parent"""
# Parse SyntaxNet output into sentence trees

def parse_dep_tree_text(file_name='sick_train_sentenceA_tree.txt'):
    """Parse a text dump of indented dependency trees into ``Node`` trees.

    The file is consumed line by line:

    * lines starting with ``"Input:"`` or ``"Parse:"`` are skipped;
    * a line whose first ``"ROOT"`` occurrence starts exactly five characters
      before the end of the line (``ROOT`` followed by the newline) opens a
      new tree, with the whitespace-split line as the root's ``data``;
    * the following lines must contain a ``"+--"`` marker; the column of that
      marker decides the nesting: deeper than the previous node means "child
      of the previous node", equal means "sibling", shallower pops back up;
    * any other non-blank line outside a tree is ignored;
    * reading stops at the first blank line or at end of file.

    Args:
        file_name: path of the tree file to read.

    Returns:
        A tuple ``(sentence_trees, max_children)``: the list of root nodes in
        file order and the largest number of children attached to any node.

    Raises:
        ValueError: if a line inside a tree has no ``"+--"`` marker.
        IndexError: if the first line under a root has its marker in column 0
            (there is no parent on the stack to attach a sibling to).
    """
    max_children = 0
    sentence_trees = []
    with open(file_name, 'r') as f:
        line = "placeholder"
        while line.strip() != "":
            line = f.readline()
            if line.startswith("Input:") or line.startswith("Parse:"):
                continue
            # Compare with `==`: the previous identity test (`is`) only worked
            # for integers that the interpreter happens to cache, so long root
            # lines were silently skipped.
            if not ("ROOT" in line and line.index("ROOT") == len(line) - 5):
                continue

            current_node = Node(line.split())
            sentence_trees.append(current_node)
            spaces = 0          # column of the "+--" marker of current_node
            node_stack = []     # (ancestor, ancestor's marker column) pairs

            while True:
                line = f.readline()
                if (line.startswith("Input:") or line.startswith("Parse:")
                        or line.strip() == ""):
                    # Next sentence header, blank line or EOF ends this tree.
                    break

                indent = line.index("+--")
                # Climb back up until we reach the level this line belongs to.
                while indent < spaces:
                    current_node, spaces = node_stack.pop()

                tokens = line.replace("|", "").replace("+--", "").split()
                if indent > spaces:
                    # Deeper than the previous node: it becomes the parent.
                    node_stack.append((current_node, spaces))
                    parent = current_node
                else:
                    # Same column: sibling of the previous node.
                    parent = node_stack[-1][0]

                new_node = Node(tokens, parent=parent)
                parent.add_child(new_node)
                max_children = max(max_children, len(parent.children))
                current_node = new_node
                spaces = indent
    return sentence_trees, max_children #a list of the roots nodes

In [8]:
"""Convert trees to a linear representation. Children are listed between the left and right 
marker in front of the parent. Each word is replaced by its id """
# Ids of the special vocabulary entries used when turning trees/sentences into id sequences:
# a fallback for out-of-vocabulary words, the brackets that open/close a node, and the padding id.
# NOTE(review): assumes the loaded vocabulary contains all four keys; a missing key fails here.
unknown_word = word_to_idx["UNKNOWN_WORD"]
left_marker = word_to_idx["LEFT_MARKER"]
right_marker = word_to_idx["RIGHT_MARKER"]
end_marker = word_to_idx["END_MARKER"]
def create_batches(trees, tree_batch_size = 25):
    """Linearise trees with ``handle_node`` and group them into full batches.

    Only complete batches are produced: trailing trees that do not fill a
    whole batch of ``tree_batch_size`` are left out.

    Args:
        trees: list of root nodes.
        tree_batch_size: number of trees per batch.

    Returns:
        ``(batches, tree_batches, max_sequence_length, batches_lengths)`` where
        ``batches[b][k]`` is the id sequence of the k-th tree of batch b,
        ``tree_batches[b]`` is the matching slice of ``trees``,
        ``max_sequence_length`` is the longest id sequence seen (0 if none) and
        ``batches_lengths[b][k]`` is the length of ``batches[b][k]``.
    """
    batches = []
    tree_batches = []
    batches_lengths = []
    max_sequence_length = 0

    full_batch_count = len(trees) // tree_batch_size
    for batch_no in range(full_batch_count):
        start = batch_no * tree_batch_size
        tree_batch = trees[start:start + tree_batch_size]

        batch = []
        for tree in tree_batch:
            linearised = []
            handle_node(tree, linearised)
            batch.append(linearised)

        batch_lengths = [len(sequence) for sequence in batch]
        max_sequence_length = max([max_sequence_length] + batch_lengths)

        tree_batches.append(tree_batch)
        batches.append(batch)
        batches_lengths.append(batch_lengths)

    return batches, tree_batches, max_sequence_length, batches_lengths
                
            
def handle_node(node, result):
    """Append the bracketed, pre-order id sequence of ``node``'s subtree to ``result``.

    For every node the output is: the left marker, the id of the node's first
    data token (or the unknown-word id when the token is not in the
    vocabulary), the sequences of all children in order, then the right marker.
    ``result`` is modified in place; nothing is returned.
    """
    result.append(left_marker)
    # Only the first token of the node's data is looked up.
    result.append(word_to_idx.get(node.data[0], unknown_word))
    for child in node.children:
        handle_node(child, result)
    result.append(right_marker)
        

In [9]:
"""Pad sequences with end markers"""
def pad_sequences(batches, max_sequence_length):
    """Right-pad every sequence in every batch, in place, with the end-marker id.

    Sequences that already have ``max_sequence_length`` or more items are left
    untouched (they are not truncated). Nothing is returned.
    """
    for batch in batches:
        for sentence in batch:
            shortfall = max_sequence_length - len(sentence)
            if shortfall > 0:
                sentence.extend([end_marker] * shortfall)

In [10]:
"""Function to load the target scores and split them into batches"""

def load_scores(file_name, batch_size):
    """Read one float score per line and group the scores into full batches.

    Blank (whitespace-only) lines are skipped. Every other line is parsed with
    ``float`` and kept, including scores equal to zero, so scores stay aligned
    with the sentences they belong to. As in ``create_batches``, a trailing
    group of fewer than ``batch_size`` scores is dropped.

    Args:
        file_name: path of the score file.
        batch_size: number of scores per batch.

    Returns:
        A list of lists of floats, each inner list holding ``batch_size`` scores.

    Raises:
        ValueError: if a non-blank line cannot be parsed as a float.
    """
    score_batches = []
    batch = []
    with open(file_name, 'r') as f:
        for line in f:
            text = line.strip()
            if not text:
                # float("\n") would raise, so blank lines are skipped explicitly.
                continue
            # Truthiness of the parsed value must not decide whether it is kept:
            # a score of 0 is still a score.
            batch.append(float(text))
            if len(batch) == batch_size:
                score_batches.append(batch)
                batch = []
    return score_batches

In [11]:
"""Convert the score into a set of probabilities over the classes"""

def convert_scores_to_p(scores_list, num_of_classes=5, lower_mass_offset=0.015):
    """Turn scalar scores into sparse target distributions over the score classes.

    Classes are the integers ``1..num_of_classes`` stored at column
    ``class - 1``. For a score ``s`` with ``f = floor(s)``:

    * ``s == num_of_classes``: all mass (1.0) goes to the last class;
    * otherwise class ``f + 1`` gets ``s - f`` and class ``f`` gets
      ``f - s + 1 - lower_mass_offset``.

    NOTE(review): with the default ``lower_mass_offset`` of 0.015 (the value
    that was hard-coded before) rows do not sum to 1 unless the score equals
    ``num_of_classes``. The reason for the offset is not visible in this
    notebook; pass ``lower_mass_offset=0.0`` for a normalised distribution.

    Args:
        scores_list: sequence of scores, each within ``[1, num_of_classes]``.
        num_of_classes: number of score classes (default 5).
        lower_mass_offset: amount subtracted from the lower class's mass.

    Returns:
        A float array of shape ``(len(scores_list), num_of_classes)``.

    Raises:
        ValueError: if a score lies outside ``[1, num_of_classes]``. Such a
            score used to wrap around to the last column through a negative
            index (below 1) or fail with an IndexError (above the maximum).
    """
    scores = np.asarray(scores_list, dtype=float)
    p = np.zeros((len(scores), num_of_classes))
    for i, score in enumerate(scores):
        if not 1 <= score <= num_of_classes:
            raise ValueError(
                "score %r is outside the supported range [1, %d]"
                % (float(score), num_of_classes))
        if score == num_of_classes:
            p[i][num_of_classes - 1] = 1
            continue
        floor = int(math.floor(score))
        p[i][floor] = score - floor  # class floor+1, shifted by one for zero-based columns
        p[i][floor - 1] = floor - score + 1 - lower_mass_offset  # class floor
    return p

In [12]:
"""Split the sentences into words and convert the words to their ids
The set of words by which to split the sentence can be found in the corresponding tree
so fetch the set of words first """
from IPython.core.debugger import set_trace
def create_sentence_batches(sentences, trees, tree_batch_size = 25):
    """Convert raw sentences to word-id sequences, batched like their trees.

    Each sentence is tokenised by padding ``,`` ``.`` ``n't`` and ``'s`` with
    spaces and splitting on whitespace. The words of the matching tree
    (collected with ``get_word_list``) are used to check the tokenisation:

    * a token that occurs in the tree is mapped to its vocabulary id, or to
      the unknown-word id when it is not in the vocabulary;
    * a token that does not occur in the tree is reported with a
      ``"missing word: ..."`` print and split around the first tree word that
      is contained in it (and is not itself a sentence token); the pieces are
      mapped like ordinary words. If no tree word matches, the token
      contributes no ids.

    Only complete batches are produced; trailing items that do not fill a
    batch of ``tree_batch_size`` are ignored.

    Args:
        sentences: list of sentence strings, aligned with ``trees``.
        trees: list of root nodes.
        tree_batch_size: number of sentences per batch.

    Returns:
        ``(batches, max_sequence_length, batches_lengths)`` with
        ``batches[b][k]`` the id list of sentence k of batch b and
        ``batches_lengths[b][k]`` its length.
    """
    max_sequence_length = 0
    batches = []
    batches_lengths = []
    for i in range(len(trees)//tree_batch_size):
        tree_batch = trees[i*tree_batch_size:(i+1)*tree_batch_size]
        sentence_batch = sentences[i*tree_batch_size:(i+1)*tree_batch_size]
        batch = []
        batch_lengths = []
        batches.append(batch)
        batches_lengths.append(batch_lengths)
        for j, tree in enumerate(tree_batch):
            word_list = []
            get_word_list(tree, word_list)
            sentence_ids = []
            batch.append(sentence_ids)
            ordered_word_list = sentence_batch[j].replace(",", " , ").replace(".", " . ").replace("n't", " n't").replace("'s", " 's ").split()

            for word in ordered_word_list:
                if word not in word_list:
                    # The interactive debugger breakpoint that used to sit here
                    # has been removed so that batch creation can run unattended.
                    print("missing word: " + word)
                    for token in word_list:
                        if token not in ordered_word_list and token in word:
                            # Split the sentence token around the tree word.
                            for half_word in word.replace(token, " "+token+" ").split():
                                if half_word in word_to_idx:
                                    sentence_ids.append(word_to_idx[half_word])
                                else:
                                    sentence_ids.append(unknown_word)
                            break
                elif word in word_to_idx:
                    sentence_ids.append(word_to_idx[word])
                else:
                    sentence_ids.append(unknown_word)
            batch_lengths.append(len(sentence_ids))
            if len(sentence_ids) > max_sequence_length:
                max_sequence_length = len(sentence_ids)

    return batches, max_sequence_length, batches_lengths
                
            
def get_word_list(node, word_list):
    """Append the first data token of every node in ``node``'s subtree to ``word_list``.

    Nodes are visited in pre-order (a node before its children, children in
    their stored order). ``word_list`` is extended in place.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        word_list.append(current.data[0])
        # Reverse so that the first child is the next one popped.
        pending.extend(reversed(current.children))

In [13]:
def load_lines(file):
    """Read all lines of a text file, dropping one trailing blank line if present.

    Lines keep their newline characters. An empty file yields an empty list
    (previously it raised IndexError when inspecting the last line).

    Args:
        file: path of the file to read.

    Returns:
        The list of lines.
    """
    with open(file, 'r') as f:
        contents = f.readlines()
    # Guard on `contents` so an empty file does not index past the end.
    if contents and len(contents[-1].strip()) == 0:
        contents.pop(-1)
    return contents

In [14]:
def create_index_offsets(batch_size, sequence_len, input_lengths):
    """Return, per batch row, the flat index of that row's last valid time step.

    Row ``i`` starts at ``i * sequence_len`` in a row-major flattening of a
    ``(batch_size, sequence_len)`` layout; its last valid step is
    ``input_lengths[i] - 1`` positions further.

    Args:
        batch_size: number of rows.
        sequence_len: padded length of every row.
        input_lengths: per-row unpadded lengths (list or array of ``batch_size`` ints).

    Returns:
        A numpy array with one flat index per row.
    """
    row_starts = np.array(range(batch_size)) * sequence_len
    return row_starts + input_lengths - 1

In [15]:
# Model / training configuration shared by the cells below.
# Width of each LSTM cell and of the output-layer weight matrices.
hidden_size = 300
# Initial value only: sequence_len is recomputed from the training data in a later cell.
sequence_len = 100
# NOTE(review): not referenced in the visible cells; makeCells() hard-codes two cells.
num_layers = 2
# Number of sentence pairs per training batch.
batch_size = 25
data_type = tf.float64
output_size = 5 # number of score classes (1..5), matching convert_scores_to_p
reg_lambda = 1e-4 #regularization parameter
# NOTE(review): not referenced in the visible cells.
max_children = 10
# Initial value of the (non-trainable) learning-rate variable.
learn_rate = 0.05
# Global-norm threshold used when clipping gradients.
max_grad_norm = 5
# The training loop runs epoch_size // batch_size steps per cycle.
epoch_size = 4500

In [16]:
# Parse the training trees for the A and B sentences of each pair.
# Each call returns the list of root nodes and the largest child count seen.
sentence_treesA, max_childrenA = parse_dep_tree_text(file_name='data/sick_train_sentenceA_tree.txt')
sentence_treesB, max_childrenB = parse_dep_tree_text(file_name='data/sick_train_sentenceB_tree.txt')

In [17]:
# Linearise the trees into id sequences and group them into batches of batch_size.
# seq_lenA / seq_lenB hold the unpadded length of every sequence, per batch.
batchesA, tree_batchesA, max_sequence_lengthA, seq_lenA = create_batches(sentence_treesA, batch_size)
batchesB, tree_batchesB, max_sequence_lengthB, seq_lenB = create_batches(sentence_treesB, batch_size)

In [18]:
# Use the longest training sequence (over both sides) as the padded length;
# this replaces the initial sequence_len from the configuration cell.
sequence_len = max(max_sequence_lengthA, max_sequence_lengthB)
# NOTE(review): sequence_len_tensor is not referenced in the visible cells.
sequence_len_tensor = tf.constant(sequence_len, dtype=tf.int32)

In [19]:
# Display the padded sequence length computed above.
sequence_len

108

In [20]:
# Pad every training sequence in place with the end marker up to sequence_len.
pad_sequences(batchesA, sequence_len)
pad_sequences(batchesB, sequence_len)

In [21]:
# Target scores for training, grouped into batches of batch_size; the training
# loop indexes scores[step] alongside batchesA[step] / batchesB[step].
scores = load_scores('data/sick_train_score.txt', batch_size)

In [22]:
# Session used by every session.run call in this notebook.
session = tf.Session()
# Side A inputs: padded id sequences, their unpadded lengths, and the flat
# indices (from create_index_offsets) of each row's last valid time step.
sentence_inputs_A = tf.placeholder(tf.int32, shape=(None, sequence_len), name="sentence_inputs_A")
sentence_inputs_A_length = tf.placeholder(tf.int32, shape=(None, ), name="sentence_inputs_A_length")
serial_index_offsets_A = tf.placeholder(tf.int32, shape=(None, ), name="serial_index_offsets_A")

# Side B inputs, same layout as side A.
sentence_inputs_B = tf.placeholder(tf.int32, shape=(None, sequence_len), name="sentence_inputs_B")
sentence_inputs_B_length = tf.placeholder(tf.int32, shape=(None, ), name="sentence_inputs_B_length")
serial_index_offsets_B = tf.placeholder(tf.int32, shape=(None, ), name="serial_index_offsets_B")

# target_score is fed the per-class distributions from convert_scores_to_p;
# target_score_scalar is fed the raw scores.
# NOTE(review): both placeholders are created with name="target_scores"; consider distinct names.
target_score = tf.placeholder(data_type, shape=(None, output_size), name="target_scores")
target_score_scalar = tf.placeholder(data_type, shape=(None, ), name="target_scores")

# The embeddings are held as a tf.constant (not a variable) and looked up per token id.
embedding = tf.constant(embedding_matrix, dtype=data_type)
embedded_inputs_A = tf.nn.embedding_lookup(embedding, sentence_inputs_A)
embedded_inputs_B = tf.nn.embedding_lookup(embedding, sentence_inputs_B)


In [23]:
"""The model."""
def makeCells():
    """Build the two LSTM cells of one stacked encoder.

    Each cell is a ``BasicLSTMCell`` of ``hidden_size`` units with forget bias
    1.0 and tuple state, constructed inside its own variable scope
    ("layer_1", then "layer_2").

    Returns:
        A list ``[cell1, cell2]`` suitable for ``MultiRNNCell``.
    """
    cells = []
    for scope_name in ("layer_1", "layer_2"):
        with tf.variable_scope(scope_name):
            cells.append(tf.contrib.rnn.BasicLSTMCell(
                hidden_size, forget_bias=1.0, state_is_tuple=True))
    return cells

# Two separately constructed cell stacks, one per sentence side.
A =makeCells()
B =makeCells()
# Both RNNs are built inside the same variable scope with reuse=tf.AUTO_REUSE.
# NOTE(review): presumably this is meant to share the LSTM weights between the
# A and B branches — confirm which variables actually end up shared.
with tf.variable_scope("UnrolledStackedCells", reuse=tf.AUTO_REUSE):
    cellA = tf.contrib.rnn.MultiRNNCell(A, state_is_tuple=True)
    # sequence_length is the unpadded length of each row.
    outputsA, final_stateA = tf.nn.dynamic_rnn(cellA, embedded_inputs_A,\
                                              dtype=data_type, sequence_length=sentence_inputs_A_length)
    
    cellB = tf.contrib.rnn.MultiRNNCell(B, state_is_tuple=True)
    outputsB, final_stateB = tf.nn.dynamic_rnn(cellB, embedded_inputs_B,\
                                              dtype=data_type, sequence_length=sentence_inputs_B_length)
    

In [24]:
# Display the side-A RNN output tensor to check its shape and dtype.
outputsA

<tf.Tensor 'UnrolledStackedCells/rnn/transpose_1:0' shape=(?, 108, 300) dtype=float64>

In [25]:
# Flatten the per-time-step outputs to rows of hidden_size and pick, for each
# sentence, the row at the fed flat index (its last valid time step, as
# computed by create_index_offsets).
serialized_outputsA = tf.reshape(outputsA, [-1, hidden_size])
terminal_outputsA = tf.gather(serialized_outputsA, serial_index_offsets_A)

serialized_outputsB = tf.reshape(outputsB, [-1, hidden_size])
terminal_outputsB = tf.gather(serialized_outputsB, serial_index_offsets_B)


In [26]:
# Pair features built from the two sentence encodings: element-wise absolute
# difference and element-wise product.
h_abs_difference = tf.abs(tf.subtract(terminal_outputsA, terminal_outputsB))
h_elewise_product = tf.multiply(terminal_outputsA, terminal_outputsB)

In [27]:

# Hidden comparison layer:
#   h_s = sigmoid(h_abs_difference . W_h_abs_difference
#                 + h_elewise_product . W_h_elewise_product + B_h)
# Note that h_s has output_size columns, because both weight matrices are
# [hidden_size, output_size].
W_h_abs_difference = tf.get_variable("W_h_abs_difference", [hidden_size, output_size], data_type)
W_h_elewise_product = tf.get_variable("W_h_elewise_product", [hidden_size, output_size], data_type)
B_h = tf.get_variable("B_h", [output_size], data_type)
h_s = tf.nn.xw_plus_b(h_abs_difference, W_h_abs_difference, B_h)
h_s = tf.add(tf.matmul(h_elewise_product, W_h_elewise_product), h_s)
h_s = tf.nn.sigmoid(h_s)

# Predicted distribution over the score classes: p_hat = softmax(h_s . W_p + B_p).
W_p = tf.get_variable("W_p", [output_size, output_size], data_type) 
B_p = tf.get_variable("B_p", [output_size], data_type)
p_hat = tf.nn.softmax(tf.nn.xw_plus_b(h_s, W_p, B_p))

In [28]:
# Predicted scalar score: y_hat = sum over classes of class_value * p_hat.
# NOTE(review): the class-value constant hard-codes tf.float64 and five classes
# instead of using data_type / output_size.
y_p = tf.multiply(p_hat, tf.constant([1,2,3,4,5], dtype=tf.float64))
y_hat = tf.reduce_sum(y_p, 1)
# MSE compares scalar scores and is only used for a summary below;
# pMSE compares the class distributions and is the term used in the training loss.
MSE = tf.losses.mean_squared_error(target_score_scalar, y_hat)
pMSE = tf.cast(tf.losses.mean_squared_error(target_score, p_hat), data_type)

In [29]:

# L2 penalty accumulated over every trainable variable.
regularizer = tf.constant(0.0,dtype=data_type)
for var in tf.trainable_variables(): 
    regularizer = tf.add(regularizer, tf.nn.l2_loss(var))
# Training objective: pMSE + reg_lambda * L2 penalty.
loss = tf.add(pMSE, tf.multiply(tf.constant(reg_lambda, dtype=data_type), regularizer))

# The learning rate is a non-trainable variable initialised from learn_rate.
learning_rate = tf.Variable(learn_rate, trainable=False)
tvars = tf.trainable_variables()
# Gradients of the loss, clipped by their global norm to max_grad_norm.
grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars),
                                      max_grad_norm)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [30]:
# Plain gradient descent on the clipped gradients; each applied update also
# increments the global step.
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_op = optimizer.apply_gradients(
        zip(grads, tvars),
        global_step=tf.train.get_or_create_global_step())

# TrainLoss and TestLoss both wrap the same `loss` tensor; which data they
# describe depends only on the feed_dict used when they are evaluated.
TrainLoss_summary = tf.summary.scalar('TrainLoss', loss)
TestLoss_summary = tf.summary.scalar('TestLoss', loss)
# The summary names below contain spaces, which TensorFlow rewrites
# (see the INFO messages printed under this cell).
MSE_summary = tf.summary.scalar('Test MSE -on Score Scalar', MSE)
pMSE_summary = tf.summary.scalar('Test pMSE -on probabilities', pMSE)

INFO:tensorflow:Summary name Test MSE -on Score Scalar is illegal; using Test_MSE_-on_Score_Scalar instead.
INFO:tensorflow:Summary name Test pMSE -on probabilities is illegal; using Test_pMSE_-on_probabilities instead.


In [31]:
def test_model(batchesA, batchesB, seq_lenA, seq_lenB, scores, output_file):
    output_scores=[]
    total_loss = 0
    feed_dict = {sentence_inputs_A:np.array(batchesA), sentence_inputs_A_length:np.array(seq_lenA), 
                 serial_index_offsets_A:create_index_offsets(len(batchesA), sequence_len, seq_lenA),
                 sentence_inputs_B:np.array(batchesB), sentence_inputs_B_length:np.array(seq_lenB),
                 serial_index_offsets_B:create_index_offsets(len(batchesB), sequence_len, seq_lenB),
                 target_score:np.array(convert_scores_to_p(scores)), target_score_scalar:np.array(scores) }


    fetches = {'loss': loss, 'y_hat': y_hat, 'TestLoss_summary':TestLoss_summary,
              'MSE_summary':MSE_summary, 'pMSE_summary':pMSE_summary}
    vals = session.run(fetches, feed_dict)
    total_loss = vals["loss"]
    output_scores.append(vals["y_hat"])


    #print("Loss %.3f" % (loss))

    #import csv
    """with open(output_file,'w') as resultFile:
        wr = csv.writer(resultFile, dialect='excel')
        for batch in output_scores:
        for score in batch: 
            wr.writerow([repr(score)])"""
    return total_loss, vals["TestLoss_summary"], vals["MSE_summary"], vals["pMSE_summary"]

In [32]:
# Module-level cache of the tree-based test data. It starts empty and is filled by
# load_tree_test_data_and_test_model the first time that function has to load the data.
test_batchesA, test_seq_lenA = None, None
test_batchesB, test_seq_lenB = None, None
test_scores = None
def load_tree_test_data_and_test_model(test_batchesA_, test_seq_lenA_,test_batchesB_, test_seq_lenB_,test_scores_):
    """Evaluate the model on the trial set, loading and caching it when needed.

    When ``test_batchesA_`` is truthy the five arguments are used as the test
    data. Otherwise the trial trees and scores are read from ``data/``, turned
    into one batch per side (batch size = number of trees), padded to
    ``sequence_len`` and stored in the module-level ``test_*`` variables.

    In both cases only the first batch is evaluated.

    Returns:
        Whatever ``test_model`` returns for that batch.
    """
    global test_batchesA, test_seq_lenA, test_batchesB, test_seq_lenB, test_scores

    if not test_batchesA_:
        trial_treesA, _ = parse_dep_tree_text(file_name='data/sick_trial_sentenceA_tree.txt')
        trial_treesB, _ = parse_dep_tree_text(file_name='data/sick_trial_sentenceB_tree.txt')

        # Using the tree count as the batch size puts the whole set in one batch.
        test_batchesA, _, _, test_seq_lenA = create_batches(trial_treesA, len(trial_treesA))
        test_batchesB, _, _, test_seq_lenB = create_batches(trial_treesB, len(trial_treesB))

        pad_sequences(test_batchesA, sequence_len)
        pad_sequences(test_batchesB, sequence_len)

        test_scores = load_scores('data/sick_trial_score.txt', len(trial_treesA))

        # Evaluate on the freshly cached data.
        test_batchesA_, test_seq_lenA_ = test_batchesA, test_seq_lenA
        test_batchesB_, test_seq_lenB_ = test_batchesB, test_seq_lenB
        test_scores_ = test_scores

    return test_model(test_batchesA_[0], test_batchesB_[0],
                      test_seq_lenA_[0], test_seq_lenB_[0],
                      test_scores_[0], "test_results.txt")

In [34]:
#Train model
# Set up the training run: running-cost accumulators, checkpoint saver,
# TensorBoard writer and variable initialisation.
start_time = time.time()
# costs / iters accumulate across all training steps; the training loop reports
# np.exp(costs / iters) from them.
costs = 0.0
iters = 0
saver = tf.train.Saver()
#saver.restore(session, "./h_lin_tree_pMSE_train/SemanticRelatednessLSTM-h_pMSE.ckpt")
# NOTE(review): `merged` is not referenced in the visible cells; the loop writes
# the individual summaries instead.
merged = tf.summary.merge_all()
writer = tf.summary.FileWriter("./h_lin_tree_pMSE_train", session.graph)
# Initialise all variables (this runs after the commented-out restore above).
session.run(tf.global_variables_initializer())


In [None]:
# Training loop: 10000 cycles of epoch_size // batch_size steps over the training batches.
# Restart the clock used for the speed figure printed below.
start_time = time.time()

for cycles in range(10000):
    # NOTE(review): indexes batchesA/batchesB/scores by step, so each of them must
    # contain at least epoch_size // batch_size batches.
    for step in range(epoch_size//batch_size):
        feed_dict = {sentence_inputs_A:np.array(batchesA[step]), sentence_inputs_A_length:np.array(seq_lenA[step]),
                     serial_index_offsets_A:create_index_offsets(len(batchesA[step]), sequence_len, seq_lenA[step]),
                     sentence_inputs_B:np.array(batchesB[step]), sentence_inputs_B_length:np.array(seq_lenB[step]),
                     serial_index_offsets_B:create_index_offsets(len(batchesB[step]), sequence_len, seq_lenB[step]),
                     target_score:np.array(convert_scores_to_p(scores[step])), target_score_scalar:np.array(scores[step])}

        # One optimisation step; the loss value is fetched in the same run.
        fetches = {'loss': loss, 'train_op':train_op}
        vals = session.run(fetches, feed_dict)
        cost = vals["loss"]

        costs += cost
        iters +=  1

        # Log on the very first step, and whenever step % (epoch_size // 100) == 10
        # (steps 10, 55, 100 and 145 with the configured epoch_size of 4500).
        if (cycles == 0 and step == 0 ) or (step % (epoch_size // 100) == 10):
            # NOTE(review): the first figure divides a batch index by epoch_size, which
            # the loop bound treats as an example count, so it stays far below 1.0.
            # "perplexity" is exp of the running mean loss, and "wps" is
            # iters * batch_size per elapsed second, i.e. sentence pairs rather than words.
            print("%.3f perplexity: %.3f speed: %.0f wps" %
                    (step * 1.0 / epoch_size, np.exp(costs / iters),
                    iters * batch_size * max(1, 1) /
                    (time.time() - start_time)))
            print("100*Loss %.3f" % (100*cost))
            
            global_step = cycles*epoch_size//batch_size+step
            # The training-loss summary is evaluated in a separate run, after the update above.
            TrainLoss_summary_val = session.run(TrainLoss_summary, feed_dict)
            writer.add_summary(TrainLoss_summary_val, global_step)
            
            # Evaluate on the trial set; the module-level test_* caches are passed in and
            # are filled by the callee the first time they are empty.
            total_test_loss, TestLoss_summary_val, MSE_summary_val, pMSE_summary_val = load_tree_test_data_and_test_model(test_batchesA, test_seq_lenA,test_batchesB, 
                                                                   test_seq_lenB,test_scores)
            writer.add_summary(TestLoss_summary_val, global_step)
            writer.add_summary(MSE_summary_val, global_step)
            writer.add_summary(pMSE_summary_val, global_step)
            
            print("total_test_loss %.7f" % (total_test_loss))
            # A checkpoint is written at every logging step, always to the same path.
            save_path = saver.save(session, "./h_lin_tree_pMSE_train/SemanticRelatednessLSTM-h.ckpt")
            
            

0.000 perplexity: 1.198 speed: 18 wps
100*Loss 18.081
total_test_loss 0.1838260
0.002 perplexity: 1.195 speed: 71 wps
100*Loss 18.126
total_test_loss 0.1828777
0.012 perplexity: 1.196 speed: 141 wps
100*Loss 18.152
total_test_loss 0.1797041
0.022 perplexity: 1.197 speed: 168 wps
100*Loss 18.093
total_test_loss 0.1772605
0.032 perplexity: 1.195 speed: 163 wps
100*Loss 16.768
total_test_loss 0.1745587
0.002 perplexity: 1.193 speed: 160 wps
100*Loss 17.050
total_test_loss 0.1723964
0.012 perplexity: 1.192 speed: 164 wps
100*Loss 16.807
total_test_loss 0.1704575
0.022 perplexity: 1.191 speed: 171 wps
100*Loss 17.574
total_test_loss 0.1691119
0.032 perplexity: 1.190 speed: 168 wps
100*Loss 15.076
total_test_loss 0.1676572
0.002 perplexity: 1.188 speed: 165 wps
100*Loss 16.437
total_test_loss 0.1666839
0.012 perplexity: 1.187 speed: 167 wps
100*Loss 16.003
total_test_loss 0.1659916
0.022 perplexity: 1.187 speed: 171 wps
100*Loss 17.383
total_test_loss 0.1656706
0.032 perplexity: 1.186 speed:

total_test_loss 0.1608910
0.012 perplexity: 1.176 speed: 173 wps
100*Loss 15.112
total_test_loss 0.1608896
0.022 perplexity: 1.176 speed: 173 wps
100*Loss 17.193
total_test_loss 0.1607598
0.032 perplexity: 1.176 speed: 173 wps
100*Loss 13.148
total_test_loss 0.1607517
0.002 perplexity: 1.176 speed: 173 wps
100*Loss 15.641
total_test_loss 0.1607372
0.012 perplexity: 1.176 speed: 173 wps
100*Loss 15.098
total_test_loss 0.1607412
0.022 perplexity: 1.176 speed: 174 wps
100*Loss 17.177
total_test_loss 0.1606079
0.032 perplexity: 1.176 speed: 173 wps
100*Loss 13.130
total_test_loss 0.1606019
0.002 perplexity: 1.176 speed: 173 wps
100*Loss 15.624
total_test_loss 0.1605836
0.012 perplexity: 1.176 speed: 173 wps
100*Loss 15.084
total_test_loss 0.1605929
0.022 perplexity: 1.176 speed: 174 wps
100*Loss 17.161
total_test_loss 0.1604565
0.032 perplexity: 1.176 speed: 174 wps
100*Loss 13.111
total_test_loss 0.1604524
0.002 perplexity: 1.176 speed: 173 wps
100*Loss 15.608
total_test_loss 0.1604304
0.

In [56]:
# Module-level cache of the sentence-order (non-tree) test inputs; filled on the
# first call of load_sentence_test_data_and_test_model.
sentence_batches_A = None
sentence_batches_B = None
sentence_len_A = None
sentence_len_B = None
# The scores are cached under their own name so that running this cell does not
# reset `test_scores`, which is the cache used by the tree-based test loader.
sentence_test_scores = None

def load_sentence_test_data_and_test_model(sentence_treesA, sentence_treesB, sentenceA_file_name, sentenceB_file_name, 
                                  score_file_name, output_file_name):
    """Evaluate the model on plain word-order sentences, batch by batch.

    On the first call the sentences and scores are loaded, converted to id
    sequences with ``create_sentence_batches`` (batches of ``batch_size``),
    padded to ``sequence_len`` and cached in module-level variables. Later
    calls reuse the cache and ignore the file-name arguments.

    Args:
        sentence_treesA: root nodes of the A sentences (used for tokenisation).
        sentence_treesB: root nodes of the B sentences.
        sentenceA_file_name: A-sentence file; falsy selects the trial-set default.
        sentenceB_file_name: B-sentence file; falsy selects the trial-set default.
        score_file_name: score file; falsy selects the trial-set default.
        output_file_name: forwarded to ``test_model``; falsy selects the default.

    Returns:
        A list with one ``test_model`` result per batch. ``test_model`` feeds a
        single batch at a time, so the batches are evaluated one by one.
    """
    # Without this declaration the assignments below make the names local, and
    # the cache check fails with UnboundLocalError before anything is loaded.
    global sentence_batches_A, sentence_batches_B
    global sentence_len_A, sentence_len_B, sentence_test_scores

    if not output_file_name:
        output_file_name = "./data/sick_trial_score_sentence_predict.csv" 

    if not sentence_batches_A:
        if not sentenceA_file_name:
            sentenceA_file_name = 'data/sick_trial_sentenceA.txt'
        if not sentenceB_file_name:
            sentenceB_file_name = 'data/sick_trial_sentenceB.txt'
        if not score_file_name:
            score_file_name = 'data/sick_trial_score.txt'

        sentencesA = load_lines(sentenceA_file_name)
        sentencesB = load_lines(sentenceB_file_name)

        sentence_batches_A, max_sentence_lengthA, sentence_len_A = create_sentence_batches(sentencesA, sentence_treesA, batch_size)
        sentence_batches_B, max_sentence_lengthB, sentence_len_B = create_sentence_batches(sentencesB, sentence_treesB, batch_size)

        pad_sequences(sentence_batches_A, sequence_len)
        pad_sequences(sentence_batches_B, sequence_len)

        sentence_test_scores = load_scores(score_file_name, batch_size)

    # zip stops at the shortest list, so only batches that have inputs for both
    # sides and scores are evaluated.
    results = []
    for ids_A, ids_B, lengths_A, lengths_B, batch_scores in zip(
            sentence_batches_A, sentence_batches_B,
            sentence_len_A, sentence_len_B, sentence_test_scores):
        results.append(test_model(ids_A, ids_B, lengths_A, lengths_B,
                                  batch_scores, output_file_name))
    return results