In [1]:
import codecs
import functools
import os
import tempfile
import zipfile

import numpy as np
import tensorflow as tf
import time
import numpy as np
from six.moves import urllib
import math


  from ._conv import register_converters as _register_converters


In [2]:
# Fresh temporary directory; download_and_unzip() saves archives and extracts
# their members into it.
data_dir = tempfile.mkdtemp()
print('saving files to %s' % data_dir)

saving files to /var/folders/_4/p7lkljg13nvbgmfj_qvgf3vm0000gn/T/tmp9r03rwj7


In [3]:
def download_and_unzip(url_base, zip_name, *file_names):
    """Download a zip archive into ``data_dir`` and extract selected members.

    Args:
        url_base: URL prefix; ``zip_name`` is appended to it verbatim.
        zip_name: File name of the archive, used both remotely and locally.
        *file_names: Archive members to extract.

    Returns:
        A list with the path of each extracted member, in the order requested.
    """
    archive_path = os.path.join(data_dir, zip_name)
    source_url = url_base + zip_name
    print('downloading %s to %s' % (source_url, archive_path))
    urllib.request.urlretrieve(source_url, archive_path)

    def _extract_member(archive, member):
        # Announce each member right before it is extracted.
        print('extracting %s' % member)
        return archive.extract(member, path=data_dir)

    with zipfile.ZipFile(archive_path, 'r') as archive:
        return [_extract_member(archive, member) for member in file_names]

In [4]:
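# NOTE: filter_glove() below reads `full_glove_path`; uncomment this cell and run
# it first whenever the filtered GloVe file has to be regenerated.
#full_glove_path, = download_and_unzip(
#  'http://nlp.stanford.edu/data/', 'glove.840B.300d.zip',
#  'glove.840B.300d.txt')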

In [5]:
filtered_glove_path = 'data/sick_filtered_glove2.txt'

def filter_glove():
    """Write the GloVe entries whose word occurs in the SICK tree files.

    Reads the module-level ``full_glove_path`` and writes the matching lines to
    ``filtered_glove_path``.

    Returns:
        The set of whitespace-separated tokens found in the tree files.
    """
    # NOTE(review): these paths have no 'data/' prefix, unlike the tree files
    # parsed later in the notebook -- confirm the intended working directory.
    sentence_path = ['sick_train_sentenceA_tree.txt', 'sick_train_sentenceB_tree.txt',
                      'sick_trial_sentenceA_tree.txt', 'sick_trial_sentenceB_tree.txt',]

    # Every whitespace-separated token of every line goes into the vocabulary.
    vocab = set()
    for tree_file in sentence_path:
        with open(tree_file, 'r') as tree_lines:
            for raw in tree_lines:
                vocab.update(raw.split())

    nread = 0
    nwrote = 0
    with codecs.open(full_glove_path, encoding='utf-8') as glove_in, \
            codecs.open(filtered_glove_path, 'w', encoding='utf-8') as glove_out:
        for nread, entry in enumerate(glove_in, 1):
            entry = entry.strip()
            # The word is everything before the first space of the entry.
            if entry and entry.split(u' ', 1)[0] in vocab:
                glove_out.write(entry + '\n')
                nwrote += 1
    print('read %s lines, wrote %s' % (nread, nwrote))
    return vocab

In [6]:
def load_embeddings(embedding_path):
    """Loads embeddings, returns weight matrix and dict from words to indices.

    Each non-blank line of the file is ``<word> <float> <float> ...``. Four
    extra rows with random values in [-0.05, 0.05) are appended after the file
    rows, in this order: UNKNOWN_WORD, LEFT_MARKER, RIGHT_MARKER, END_MARKER.

    Args:
        embedding_path: Path of a UTF-8 text file with one embedding per line.

    Returns:
        A ``(matrix, word_idx)`` tuple: ``matrix`` is a float32 array with one
        row per word plus the four special rows; ``word_idx`` maps each word
        (and each special token name) to its row index.

    Raises:
        ValueError: If the file contains no embedding lines.
    """
    print('loading word embeddings from %s' % embedding_path)
    weight_vectors = []
    word_idx = {}
    with codecs.open(embedding_path, encoding='utf-8') as f:
        for line in f:
            # Blank lines would otherwise break the two-way unpack below.
            if not line.strip():
                continue
            word, vec = line.split(u' ', 1)
            word_idx[word] = len(weight_vectors)
            weight_vectors.append(np.array(vec.split(), dtype=np.float32))
    if not weight_vectors:
        raise ValueError('no embeddings found in %s' % embedding_path)

    # Annoying implementation detail; '(' and ')' are replaced by '-LRB-' and
    # '-RRB-' respectively in the parse-trees.
    if u'(' in word_idx:
        word_idx[u'-LRB-'] = word_idx.pop(u'(')
    if u')' in word_idx:
        word_idx[u'-RRB-'] = word_idx.pop(u')')

    # One random embedding row per special token, appended in a fixed order so
    # the indices (and the random draws) match the previous unrolled version.
    vector_shape = weight_vectors[0].shape
    for special_token in ("UNKNOWN_WORD", "LEFT_MARKER", "RIGHT_MARKER", "END_MARKER"):
        word_idx[special_token] = len(weight_vectors)
        weight_vectors.append(np.random.uniform(
            -0.05, 0.05, vector_shape).astype(np.float32))

    return np.stack(weight_vectors), word_idx

In [7]:
# Load the filtered GloVe vectors; word_to_idx maps each token to its row in
# embedding_matrix.
embedding_matrix, word_to_idx = load_embeddings(filtered_glove_path)

loading word embeddings from sick_filtered_glove2.txt


In [8]:
"""Tree node class"""
class Node(object):
    """A tree node holding a token list, its child nodes and its parent."""

    def __init__(self, data, parent=None):
        self.parent = parent
        self.children = []
        self.data = data

    def add_child(self, obj):
        """Append ``obj`` to this node's children."""
        self.children.append(obj)

    def add_parent(self, obj):
        """Set ``obj`` as this node's parent."""
        self.parent = obj

    def __str__(self, tabs=0):
        """Render the subtree, indenting each level by two more spaces."""
        header = " " * tabs + "+-- Node: " + "|".join(self.data) + "\n"
        rendered_children = [child.__str__(tabs + 2) for child in self.children]
        return header + "\n".join(rendered_children)

In [9]:
"""Preparing inputs"""
#Parse SyntaxNet output into sentence trees

def parse_dep_tree_text(file_name='sick_train_sentenceA_tree.txt'):
    """Parse a text dump of dependency trees into ``Node`` trees.

    Lines starting with "Input:" or "Parse:" are skipped. A line whose "ROOT"
    token is immediately followed by the trailing newline starts a new tree;
    the following lines are attached according to the column of their "+--"
    marker. Reading stops at the first blank line or at end of file.

    Args:
        file_name: Path of the parse dump to read.

    Returns:
        A ``(sentence_trees, max_children)`` tuple: the list of root nodes and
        the largest number of children seen on any single node.
    """
    max_children = 0
    sentence_trees = []
    with open(file_name, 'r') as f:
        line = "placeholder"
        while not (line.strip() == ""):
            line = f.readline()
            if line.startswith("Input:") or line.startswith("Parse:"):
                continue
            # Compare by value: identity (`is`) only happens to work for small
            # ints and silently misses root lines longer than ~260 characters.
            elif "ROOT" in line and (line.index("ROOT") == len(line) - 5):
                current_node = Node(line.split())
                sentence_trees.append(current_node)
                spaces = 0
                # Stack of (ancestor node, ancestor's "+--" column).
                node_stack = []
                while not line.startswith("Input:"):
                    line = f.readline()
                    if line.startswith("Input:") or line.startswith("Parse:"):
                        break
                    elif line.strip() == "":
                        break

                    indent = line.index("+--")
                    # Shallower than the current node: climb back up.
                    while indent < spaces:
                        current_node, spaces = node_stack.pop()

                    # Drop the tree-drawing characters, keep the tokens.
                    tokens = line.replace("|", "").replace("+--", "").split()
                    if indent > spaces:
                        # Deeper: child of the current node.
                        parent = current_node
                        node_stack.append((current_node, spaces))
                    else:
                        # Same column: sibling of the current node.
                        parent = node_stack[-1][0]

                    new_node = Node(tokens, parent=parent)
                    parent.add_child(new_node)
                    max_children = max(max_children, len(parent.children))
                    current_node = new_node
                    spaces = indent
    return sentence_trees, max_children #a list of the roots nodes

In [10]:
# Embedding-row indices of the special tokens added by load_embeddings();
# handle_node(), create_batches() and pad_sequences() read them as globals.
unknown_word = word_to_idx["UNKNOWN_WORD"]
left_marker = word_to_idx["LEFT_MARKER"]
right_marker = word_to_idx["RIGHT_MARKER"]
end_marker = word_to_idx["END_MARKER"]
def create_batches(trees, tree_batch_size = 25):
    """Linearise trees into padded index sequences, grouped into full batches.

    Trees that do not fill a complete batch at the end are left out. Every
    sequence is padded with ``end_marker`` to the longest sequence plus three.

    Args:
        trees: Sequence of tree roots accepted by ``handle_node``.
        tree_batch_size: Number of trees per batch.

    Returns:
        ``(batches, tree_batches, max_sequence_length)``: the padded index
        sequences per batch, the matching slices of ``trees``, and the padded
        length.
    """
    full_batch_count = len(trees) // tree_batch_size
    tree_batches = [trees[k * tree_batch_size:(k + 1) * tree_batch_size]
                    for k in range(full_batch_count)]

    batches = []
    longest = 0
    for group in tree_batches:
        encoded_group = []
        for tree in group:
            sequence = []
            handle_node(tree, sequence)
            encoded_group.append(sequence)
            longest = max(longest, len(sequence))
        batches.append(encoded_group)

    padded_length = longest + 3
    for encoded_group in batches:
        for sequence in encoded_group:
            shortfall = padded_length - len(sequence)
            if shortfall > 0:
                sequence.extend([end_marker] * shortfall)
    return batches, tree_batches, padded_length
                
            
def handle_node(node, result):
    """Append the bracketed pre-order encoding of ``node``'s subtree to ``result``.

    Each node contributes ``left_marker``, the index of its first data token
    (``unknown_word`` when the token is not in ``word_to_idx``), the encodings
    of its children in order, and finally ``right_marker``.
    """
    head_token = node.data[0]
    result.append(left_marker)
    result.append(word_to_idx[head_token] if head_token in word_to_idx else unknown_word)
    for child in node.children:
        handle_node(child, result)
    result.append(right_marker)
        

In [31]:
def pad_sequences(batches, max_sequence_length):
    """Pad every sentence in place with ``end_marker`` up to ``max_sequence_length``.

    Sentences that are already long enough are left unchanged.
    """
    for batch in batches:
        for sentence in batch:
            shortfall = max_sequence_length - len(sentence)
            if shortfall > 0:
                sentence.extend([end_marker] * shortfall)

In [11]:
"""Function to load the target scores and split them into batches"""

def load_scores(file_name, batch_size):
    """Read one float score per line and group them into full batches.

    Blank lines are skipped. Scores left over after the last full batch are
    not returned.

    Args:
        file_name: Path of a text file with one score per line.
        batch_size: Number of scores per batch.

    Returns:
        A list of lists, each holding exactly ``batch_size`` floats.

    Raises:
        ValueError: If a non-blank line cannot be parsed as a float.
    """
    score_batches = []
    batch = []
    with open(file_name, 'r') as f:
        for line in f:
            text = line.strip()
            # Test for emptiness on the text, not on the parsed value: a blank
            # line is skipped and a legitimate 0.0 score is kept.
            if not text:
                continue
            batch.append(float(text))
            if len(batch) == batch_size:
                score_batches.append(batch)
                batch = []
    return score_batches

In [12]:
def convert_scores_to_p(scores_list, num_of_classes=5):
    """Turn real-valued scores into sparse target distributions over classes.

    Classes are 1..num_of_classes and column ``k`` of the result is class
    ``k + 1``. An integer score puts all mass on its class; a fractional score
    splits the mass between the two neighbouring classes so that the expected
    class equals the score.

    Args:
        scores_list: Sequence of scores, each in [1, num_of_classes].
        num_of_classes: Number of classes (default 5).

    Returns:
        Array of shape ``(len(scores_list), num_of_classes)`` whose rows sum
        to 1.

    Raises:
        ValueError: If a score lies outside [1, num_of_classes]. Such scores
            would otherwise wrap around to the last class or index out of
            bounds.
    """
    scores = np.asarray(scores_list, dtype=np.float64)
    p = np.zeros((len(scores), num_of_classes))
    for i, score in enumerate(scores):
        if not 1 <= score <= num_of_classes:
            raise ValueError('score %r is outside [1, %d]' % (float(score), num_of_classes))
        floor = int(math.floor(score))
        if floor == num_of_classes:
            # Top score: all mass on the last class.
            p[i][num_of_classes - 1] = 1
        else:
            p[i][floor] = score - floor          # class floor + 1
            p[i][floor - 1] = floor - score + 1  # class floor
    return p


In [13]:
# Hyperparameters and sizes used by the cells below.
hidden_size = 300  # LSTM units per layer and width of the dense layers
sequence_len = 100  # provisional; recomputed from the data before the graph is built
num_layers = 2  # NOTE(review): makeCells() hard-codes two layers and does not read this
batch_size = 25
data_type = tf.float64
output_size = 5 #number of relatedness classes (scores 1..5, see convert_scores_to_p)
reg_lambda = 1e-4 #regularization parameter
max_children = 10  # NOTE(review): not referenced by the cells visible here
learn_rate = 0.05  # initial value of the learning_rate variable
max_grad_norm = 5  # global-norm threshold for gradient clipping
epoch_size = 4500  # number of iterations of the training loop

In [14]:
# Parse the dependency trees of sentence A and sentence B of the training set.
sentence_treesA, max_childrenA = parse_dep_tree_text(file_name='data/sick_train_sentenceA_tree.txt')
sentence_treesB, max_childrenB = parse_dep_tree_text(file_name='data/sick_train_sentenceB_tree.txt')

In [32]:
# Linearise both tree lists into index sequences, batch_size trees per batch.
batchesA, tree_batchesA, max_sequence_lengthA = create_batches(sentence_treesA, batch_size)
batchesB, tree_batchesB, max_sequence_lengthB = create_batches(sentence_treesB, batch_size)

In [33]:
# Common sequence length for both sides: the longer of the two padded lengths,
# but never less than 100.
sequence_len = max(max_sequence_lengthA, max_sequence_lengthB, 100)

In [38]:
# Pad both sides in place so every sentence has exactly sequence_len entries.
pad_sequences(batchesA, sequence_len)
pad_sequences(batchesB, sequence_len)

In [39]:
# Sanity check: after padding, this equals sequence_len.
len(batchesA[0][0])

111

In [18]:
# Target scores grouped into lists of batch_size, indexed like batchesA/batchesB.
scores = load_scores('data/sick_train_score.txt', batch_size)

In [19]:
# Graph inputs: one batch of token-index sequences per sentence side plus the
# target distribution over the output classes.
session = tf.Session()
sentence_inputs_A = tf.placeholder(tf.int32, shape=(batch_size, sequence_len), name="sentence_inputs_A")
sentence_inputs_B = tf.placeholder(tf.int32, shape=(batch_size, sequence_len), name="sentence_inputs_B")
target_score = tf.placeholder(data_type, shape=(batch_size, output_size), name="target_scores")
# The embedding table is a constant, so it is not a trainable variable.
embedding = tf.constant(embedding_matrix, dtype=data_type)
embedded_inputs_A = tf.nn.embedding_lookup(embedding, sentence_inputs_A)
embedded_inputs_B = tf.nn.embedding_lookup(embedding, sentence_inputs_B)


In [20]:
"""The model."""
def makeCells():
    """Return a list of two BasicLSTMCell objects of hidden_size units each."""
    with tf.variable_scope("layer_1"):
        cell1 = tf.contrib.rnn.BasicLSTMCell(
          hidden_size, forget_bias=1.0, state_is_tuple=True)
    with tf.variable_scope("layer_2"):
        cell2 = tf.contrib.rnn.BasicLSTMCell(
          hidden_size, forget_bias=1.0, state_is_tuple=True)
    return [cell1, cell2]

# Separate cell lists for the sentence-A and sentence-B towers.
A =makeCells()
B =makeCells()
# NOTE(review): both towers are unrolled inside the same "xx" scope with
# AUTO_REUSE -- presumably so that they share LSTM weights; confirm.
with tf.variable_scope("xx", reuse=tf.AUTO_REUSE):
    # Tower A: two-layer LSTM unrolled over sequence_len time steps.
    cellA = tf.contrib.rnn.MultiRNNCell(A, state_is_tuple=True)
    initial_stateA = cellA.zero_state(batch_size, data_type)
    inputs_A = tf.unstack(embedded_inputs_A, num=sequence_len, axis=1)
    unstacked_outputsA, final_stateA = tf.nn.static_rnn(cellA, inputs_A,\
                                              initial_state=initial_stateA,\
                                              dtype=data_type)
    # This flattened view is re-assigned by the next cell before it is used.
    outputsA = tf.reshape(tf.concat(unstacked_outputsA, 1), [-1, hidden_size])

    # Tower B: same construction for the second sentence.
    cellB = tf.contrib.rnn.MultiRNNCell(B, state_is_tuple=True)
    initial_stateB = cellB.zero_state(batch_size, data_type)
    inputs_B = tf.unstack(embedded_inputs_B, num=sequence_len, axis=1)
    unstacked_outputsB, final_stateB = tf.nn.static_rnn(cellB, inputs_B, initial_state=initial_stateB,\
                                              dtype=data_type)
    # This flattened view is re-assigned by the next cell before it is used.
    outputsB = tf.reshape(tf.concat(unstacked_outputsB, 1), [-1, hidden_size])


In [21]:
# Concatenate the per-step LSTM outputs of each sentence into one row per
# example. The leading dimension is batch_size, the same constant the
# placeholders use, instead of a hard-coded 25.
outputsB = tf.reshape(tf.concat(unstacked_outputsB, 1), [batch_size, sequence_len * hidden_size])
outputsA = tf.reshape(tf.concat(unstacked_outputsA, 1), [batch_size, sequence_len * hidden_size])

# One dense projection per sentence side down to hidden_size features.
layerA = tf.layers.dense(outputsA, hidden_size, name='layerA', reuse=False)
layerB = tf.layers.dense(outputsB, hidden_size, name='layerB', reuse=False)


In [22]:
# Pair features: element-wise absolute difference and element-wise product of
# the two sentence representations.
h_abs_difference = tf.abs(tf.subtract(layerA, layerB))
h_elewise_product = tf.multiply(layerA, layerB)

In [23]:

# h_s = sigmoid(h_abs_difference . W_h_abs_difference
#               + h_elewise_product . W_h_elewise_product + B_h)
W_h_abs_difference = tf.get_variable("W_h_abs_difference", [hidden_size, output_size], data_type)
W_h_elewise_product = tf.get_variable("W_h_elewise_product", [hidden_size, output_size], data_type)
B_h = tf.get_variable("B_h", [output_size], data_type)
h_s = tf.nn.xw_plus_b(h_abs_difference, W_h_abs_difference, B_h)
h_s = tf.add(tf.matmul(h_elewise_product, W_h_elewise_product), h_s)
h_s = tf.nn.sigmoid(h_s)

# p_hat = softmax(h_s . W_p + B_p): predicted distribution over the classes.
# h_s has output_size columns here, so W_p is square.
W_p = tf.get_variable("W_p", [output_size, output_size], data_type) #TODO what's the correct shape??
B_p = tf.get_variable("B_p", [output_size], data_type)
p_hat = tf.nn.softmax(tf.nn.xw_plus_b(h_s, W_p, B_p))

In [25]:
#p_over_p_hat

In [26]:
# KL(p || p_hat) = sum_k p_k * log(p_k / p_hat_k), averaged over the batch.
# The previous expression omitted the logarithm. The floor keeps the log
# finite where a target entry is 0; those terms are then multiplied by 0 and
# contribute nothing.
kl_epsilon = tf.constant(1e-12, dtype=data_type)
p_over_p_hat = tf.div(target_score, tf.maximum(p_hat, kl_epsilon))
KL = tf.reduce_mean(tf.reduce_sum(
    tf.multiply(target_score, tf.log(tf.maximum(p_over_p_hat, kl_epsilon))), 1))

# L2 penalty over every trainable variable, weighted by reg_lambda.
regularizer = tf.constant(0.0,dtype=data_type)
for var in tf.trainable_variables():
    regularizer = tf.add(regularizer, tf.nn.l2_loss(var))
loss = KL + reg_lambda*regularizer

# Gradients of the loss, clipped by global norm to max_grad_norm.
learning_rate = tf.Variable(learn_rate, trainable=False)
tvars = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars),
                                      max_grad_norm)

In [27]:
# Plain gradient descent applied to the clipped gradients; each run of
# train_op also increments the global step.
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_op = optimizer.apply_gradients(
        zip(grads, tvars),
        global_step=tf.train.get_or_create_global_step())

In [28]:
#Train model
# Bookkeeping for the training loop: running cost, iteration count, summary
# writer and checkpoint saver.
start_time = time.time()
costs = 0.0
iters = 0
writer = tf.summary.FileWriter("./event_and_checkpoints/", session.graph)
saver = tf.train.Saver()
# Concrete zero states; the training loop feeds them back in on every step.
stateA = session.run(initial_stateA)
stateB = session.run(initial_stateB)
session.run(tf.global_variables_initializer())

In [40]:
start_time = time.time()

# batchesA, batchesB and scores are indexed by step, so never run past the
# shortest of them; otherwise a large epoch_size ends in an IndexError.
num_steps = min(epoch_size, len(batchesA), len(batchesB), len(scores))

for step in range(num_steps):
    feed_dict = {sentence_inputs_A:np.array(batchesA[step]), sentence_inputs_B:np.array(batchesB[step]),
                 target_score:np.array(convert_scores_to_p(scores[step]))}
    # Feed the zero LSTM states explicitly for both towers.
    for i, (c, h) in enumerate(initial_stateA):
        feed_dict[c] = stateA[i].c
        feed_dict[h] = stateA[i].h
    for i, (c, h) in enumerate(initial_stateB):
        feed_dict[c] = stateB[i].c
        feed_dict[h] = stateB[i].h

    # One optimisation step; the loss is fetched for logging.
    fetches = {'loss': loss, 'train_op':train_op}
    vals = session.run(fetches, feed_dict)
    cost = vals["loss"]

    costs += cost
    iters +=  1

    # Periodic progress report and checkpoint.
    if step % (epoch_size // 100) == 10:
        print("%.3f perplexity: %.3f speed: %.0f wps" %
                (step * 1.0 / epoch_size, np.exp(costs / iters),
                iters * batch_size * max(1, 1) /
                (time.time() - start_time)))
        print("100*Loss %.3f" % (100*cost))
        save_path = saver.save(session, "./event_and_checkpoints/SemanticRelatednessLSTM.ckpt")

loop
loop
loop
loop
loop


In [45]:
# Scratch check: the modulus used by the training loop's logging condition.
epoch_size // 100

45

In [42]:
# Scratch check: the fetch dict passed to session.run() in the training loop.
fetches

{'loss': <tf.Tensor 'add_14:0' shape=() dtype=float64>,
 'train_op': <tf.Operation 'GradientDescent' type=AssignAdd>}