In [1]:
import codecs
import functools
import os
import tempfile
import zipfile

from nltk.tokenize import sexpr
import numpy as np
from six.moves import urllib
import tensorflow as tf
import ChildSumTree as cst

  from ._conv import register_converters as _register_converters


In [2]:
data_dir = tempfile.mkdtemp()
print('saving files to %s' % data_dir)

saving files to /var/folders/_4/p7lkljg13nvbgmfj_qvgf3vm0000gn/T/tmp68o_vrpu


In [3]:
def download_and_unzip(url_base, zip_name, *file_names):
  zip_path = os.path.join(data_dir, zip_name)
  url = url_base + zip_name
  print('downloading %s to %s' % (url, zip_path))
  urllib.request.urlretrieve(url, zip_path)
  out_paths = []
  with zipfile.ZipFile(zip_path, 'r') as f:
    for file_name in file_names:
      print('extracting %s' % file_name)
      out_paths.append(f.extract(file_name, path=data_dir))
  return out_paths

In [4]:
full_glove_path, = download_and_unzip(
  'http://nlp.stanford.edu/data/', 'glove.840B.300d.zip',
  'glove.840B.300d.txt')

downloading http://nlp.stanford.edu/data/glove.840B.300d.zip to /var/folders/_4/p7lkljg13nvbgmfj_qvgf3vm0000gn/T/tmpwiweseip/glove.840B.300d.zip
extracting glove.840B.300d.txt


In [3]:
filtered_glove_path = 'sick_filtered_glove2.txt'

In [4]:
def filter_glove():
    vocab = set()
    # Download the full set of unlabeled sentences separated by '|'.
    sentence_path = ['sick_train_sentenceA_tree.txt', 'sick_train_sentenceB_tree.txt',
                      'sick_trial_sentenceA_tree.txt', 'sick_trial_sentenceB_tree.txt',]
    for path in sentence_path:
        with open(path, 'r') as f:
            for line in f:
                # Drop the trailing newline and strip backslashes. Split into words.
                vocab.update(line.strip().split())
    nread = 0
    nwrote = 0
    with codecs.open(full_glove_path, encoding='utf-8') as f:
        with codecs.open(filtered_glove_path, 'w', encoding='utf-8') as out:
            for line in f:
                nread += 1
                line = line.strip()
                if not line: continue
                if line.split(u' ', 1)[0] in vocab:
                    out.write(line + '\n')
                    nwrote += 1
    print('read %s lines, wrote %s' % (nread, nwrote))
    return vocab

In [6]:
vocab = filter_glove()

NameError: name 'full_glove_path' is not defined

In [4]:
def load_embeddings(embedding_path):
    """Loads embedings, returns weight matrix and dict from words to indices."""
    print('loading word embeddings from %s' % embedding_path)
    weight_vectors = []
    word_idx = {}
    with codecs.open(embedding_path, encoding='utf-8') as f:
        for line in f:
            word, vec = line.split(u' ', 1)
            word_idx[word] = len(weight_vectors)
            weight_vectors.append(np.array(vec.split(), dtype=np.float32))
    # Annoying implementation detail; '(' and ')' are replaced by '-LRB-' and
    # '-RRB-' respectively in the parse-trees.
    if u'(' in word_idx:
        word_idx[u'-LRB-'] = word_idx.pop(u'(')
    if u')' in word_idx:
        word_idx[u'-RRB-'] = word_idx.pop(u')')
    # Random embedding vector for unknown words.
    weight_vectors.append(np.random.uniform(
      -0.05, 0.05, weight_vectors[0].shape).astype(np.float32))
    return np.stack(weight_vectors), word_idx

In [5]:
embedding_matrix, word_to_idx = load_embeddings(filtered_glove_path)
        

loading word embeddings from sick_filtered_glove2.txt


In [6]:
embedding_matrix.shape

(2336, 300)

In [9]:
input_batch

<tf.Tensor 'embedding_lookup:0' shape=(25, 300) dtype=float64>

In [11]:
final_state2

LSTMStateTuple(c=<tf.Tensor 'Layer2/Add_28:0' shape=(25, 300) dtype=float64>, h=<tf.Tensor 'Layer2/Mul_11:0' shape=(25, 300) dtype=float64>)

In [7]:
"""Tree node class"""
class Node(object):
    def __init__(self, data, parent=None):
        self.data = data
        self.children = []
        self.parent = parent

    def add_child(self, obj):
        self.children.append(obj)
        
    def add_parent(self, obj):
        self.parent = obj
        
    def __str__(self, tabs=0):
        #set_trace()
        tab_spaces = str.join("", [" " for i in range(tabs)])
        return tab_spaces + "+-- Node: "+ str.join("|", self.data) + "\n"\
                + str.join("\n", [child.__str__(tabs+2) for child in self.children])

In [8]:
"""Preparing inputs"""
#Parse SyntaxtNet output to sentence trees 

def parse_dep_tree_text(file_name='sick_train_sentenceA_tree.txt'):
    all_data=[]
    max_children = 0
    sentence_trees = []
    with open(file_name, 'r') as f:
        line = "placeholder"
        while not (line.strip() == ""):
            line = f.readline()
            #set_trace()
            if line.startswith("Input:") or line.startswith("Parse:"):
                continue
            elif "ROOT" in line and (line.index("ROOT") is len(line)-5):
                root_tokens = line.split()
                current_node = Node(root_tokens)
                sentence_trees.append(current_node)
                spaces = 0
                node_stack = []
                #set_trace()
                while not line.startswith("Input:"): 
                    line = f.readline()
                    if line.startswith("Input:") or line.startswith("Parse:"):
                        break
                    elif  line.strip() == "":
                        break
                    else:
                        #set_trace()
                        if line.index("+--") < spaces:
                            while line.index("+--") < spaces:
                                current_node, spaces = node_stack.pop()

                        if line.index("+--") > spaces:
                            line_copy = line
                            line_copy = line_copy.replace("|", "")
                            line_copy = line_copy.replace("+--", "")
                            tokens = line_copy.split()
                            new_node = Node(tokens, parent=current_node)
                            all_data.append(tokens)
                            current_node.add_child(new_node)
                            if len(current_node.children)> max_children:
                                max_children = len(current_node.children)
                            node_stack.append((current_node, spaces))
                            current_node = new_node
                            spaces = line.index("+--")

                        elif line.index("+--") == spaces:
                            line_copy = line
                            line_copy = line_copy.replace("|", "")
                            line_copy = line_copy.replace("+--", "")
                            tokens = line_copy.split()
                            all_data.append(tokens)
                            new_node = Node(tokens, parent=node_stack[-1][0])
                            node_stack[-1][0].add_child(new_node)
                            if len(node_stack[-1][0].children)> max_children:
                                max_children = len(node_stack[-1][0].children)
                            current_node = new_node
                            spaces = line.index("+--")
    return sentence_trees, max_children #a list of the roots nodes

In [9]:
"""split the tree nodes into batches and stages/generations of children"""
#TODO Shuffle the tree lists and the scores
def create_batches(trees, tree_batch_size = 25):
    tree_batches = []
    stage_batches = []
    max_gen_length = 0
    sentence_trees = np.array(trees)
    for i in range(len(sentence_trees)//tree_batch_size):
        #create a batch of tree_batch_size trees
        tree_batch = sentence_trees[i*tree_batch_size:(i+1)*tree_batch_size] 
        tree_batches.append(tree_batch)
        stage_batch = []
        stage_batches.append(stage_batch)
        for tree in tree_batch:
            nodes = [tree]
            generations = []
            while len(nodes) > 0:
                generations.append([]+nodes)
                next_nodes=[]
                for node in nodes:
                    next_nodes.extend(node.children)
                nodes = next_nodes
            current_stage = 0
            for generation in reversed(generations):
                if len(stage_batch)<current_stage+1:
                    stage_batch.append(generation)
                else:
                    stage_batch[current_stage].extend(generation)
                if len(stage_batch[current_stage])>max_gen_length:
                    max_gen_length = len(stage_batch[current_stage])
                current_stage += 1
    return stage_batches, tree_batches, max_gen_length

In [10]:
#TODO each node must know it's location 

def pad_batches(stage_batches, generation_size):
    for sb_idx, stage_batch in enumerate(stage_batches):
        for g_idx, generation in enumerate(stage_batch):
            while (not len(generation)==0) and len(generation) < generation_size:
                generation.append(Node(['.'], parent=None))
                

In [11]:
"""Function to load the target scores and split them into batches"""
from IPython.core.debugger import set_trace

def load_scores(file_name, batch_size):
    score_batches = []
    with open(file_name, 'r') as f:
        batch = []
        for line in f:
            if line and float(line):
                batch.append(float(line))
                
            if len(batch)== batch_size: 
                score_batches.append(batch)
                batch = []
    return score_batches
            

In [19]:
len(load_scores('sick_train_score.txt', 25))

180

In [12]:
"""Convert words to ids
 Takes batches of tree nodes arranged in stages/generations
 returns the same data structure as the input except instead of nodes there are ids"""
def covert_words_ids(stage_batches, word_to_idx, unknown_word_id):    
    stage_batches_ids = []
    for batch_idx, stage_batch in enumerate(stage_batches):
        stage_batch_ids = []
        stage_batches_ids.append(stage_batch_ids)
        for gen_idx, generation in enumerate(stage_batch):
            generation_ids = []
            stage_batch_ids.append(generation_ids)
            for node_idx, node in enumerate(generation):
                word = node.data[0]
                node.data.append((batch_idx, gen_idx, node_idx))
                if word in word_to_idx:
                    node_word_id = word_to_idx[word]
                    generation_ids.append(node_word_id)
                else:
                    generation_ids.append(unknown_word_id)
                    print("unknown: "+word)
    return stage_batches_ids
            

In [13]:
"""takes the target score array and converts them to probabilities """
def convert_scores_to_p(scores, num_of_classes=5):
    p = np.zeros((len(scores), num_of_classes*(num_of_classes-1)+1))
    for i, score in enumerate(scores):        
        sim = score * (num_of_classes - 1) + 1
        floor = math.floor(sim) 
        ceil = math.ceil(sim) 
        if math.ceil(sim) == math.floor(sim):
            p[i][ceil] = 1
        else:
            p[i][floor] = ceil - sim
            p[i][ceil] = sim - floor
    return p

In [14]:
hidden_size = 300
num_layers = 2
batch_size = 25
data_type = tf.float64
output_size = 21 #21 classes
reg_lambda = 1e-4 #regularization parameter
max_children = 10

In [15]:
sentence_trees, max_children = parse_dep_tree_text(file_name='sick_train_sentenceA_tree.txt')

In [16]:
stage_batches, tree_batches, max_gen_length = create_batches(sentence_trees, tree_batch_size = 25)
batch_size=max_gen_length

In [17]:
pad_batches(stage_batches, max_gen_length)

In [18]:
unknown_word_id = len(embedding_matrix)-1
stage_batches_ids = covert_words_ids(stage_batches, word_to_idx, unknown_word_id)

unknown: unstitching
unknown: uninterestedly
unknown: midspeech


In [19]:
session = tf.Session()
inputs_idx = tf.placeholder(tf.int32, shape=(batch_size, ), name="inputs_idx")
embedding = tf.constant(embedding_matrix, dtype=data_type)
input_batch = tf.nn.embedding_lookup(embedding, inputs_idx)

In [20]:
"""The model."""
proto_cell = tf.contrib.rnn.BasicLSTMCell(
          hidden_size, forget_bias=0.0, state_is_tuple=True,
          reuse=False)

cell_layer_1 = cst.ChildSumTreeLSTMCell(hidden_size)
initial_state1 = [proto_cell.zero_state(batch_size, data_type) for i in range(max_children)]
outputs1, final_state1 = cell_layer_1(input_batch, state=initial_state1, scope='Layer1')
cell_layer_2 = cst.ChildSumTreeLSTMCell(hidden_size)
initial_state2 = [proto_cell.zero_state(batch_size, data_type) for i in range(max_children)]
outputs2, final_state2 = cell_layer_2(outputs1, state=initial_state2, scope='Layer2')

In [21]:
h_abs_difference = tf.placeholder(data_type, name="h_abs_difference")
h_elewise_product = tf.placeholder(data_type, name="h_elewise_product")

W_h_abs_difference = tf.get_variable("W_h_abs_difference", [hidden_size, output_size], data_type)
W_h_elewise_product = tf.get_variable("W_h_elewise_product", [hidden_size, output_size], data_type)
B_h = tf.get_variable("B_h", [output_size], data_type)
h_s = tf.nn.xw_plus_b(h_abs_difference, W_h_abs_difference, B_h)
h_s = tf.add(tf.matmul(h_elewise_product, W_h_elewise_product), h_s)
h_s = tf.nn.sigmoid(h_s)

W_p = tf.get_variable("W_p", [output_size, output_size], data_type) #TODO what's the correct shape??
B_p = tf.get_variable("B_p", [output_size], data_type)
p_hat = tf.nn.softmax(tf.nn.xw_plus_b(h_s, W_p, B_p))

#r = np.linspace(1.0,5.0,21) #1.0, 1.1, 1.2 ... 4.9, 5.0
#y_hat = tf.matmul(r, p_hat)


In [33]:
gen_length = 109
for sb_idx, stage_batch in enumerate(stage_batches):
    for g_idx, generation in enumerate(stage_batch):
        if len(generation)<gen_length:
            gen_length = len(generation)
print(gen_length)
            

1


In [34]:
nodes = 0
for sb_idx, stage_batch in enumerate(stage_batches):
    for g_idx, generation in enumerate(stage_batch):
        nodes += len(generation)
print(nodes)

43875


In [36]:
node_count = 0
for sb_idx, batch in enumerate(tree_batches):
    for g_idx, node in enumerate(batch):
        node_q = [node]
        while len(node_q)>0:
            current_node = node_q.pop(0)
            node_q.extend(current_node.children)
            node_count +=1
print(nodes)

0


In [22]:
"""Feed the data in, run the session and prepare the final state for the next generation 
   Repeat until every root in the batch has an output
   Calculate the loss and optimize """
epochs = 1000
writer = tf.summary.FileWriter("./", session.graph)
state1 = session.run(initial_state1)
state2 = session.run(initial_state2)

In [25]:
state1

[LSTMStateTuple(c=array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]]), h=array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])),
 LSTMStateTuple(c=array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]]), h=array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0.

In [28]:
session.run(tf.global_variables_initializer())
 
for epoch_idx in range(epochs): 
    for batch_idx, batch in enumerate(stage_batches_ids): 
        for gen_idx, generation in enumerate(batch):
            """Feed generation in and get output and states"""
            feed_dict = {inputs_idx:generation}
            if gen_idx == 0: 
                """Use zerostate"""
                
                for child_i, (c, h) in enumerate(initial_state1):
                    feed_dict[c] = state1[child_i].c
                    feed_dict[h] = state1[child_i].h
                for child_i, (c, h) in enumerate(initial_state2):
                    feed_dict[c] = state2[child_i].c
                    feed_dict[h] = state2[child_i].h
            else: 
                """generate the state from the children of the nodes in the next batch"""
                zero_state1 = session.run(initial_state1) 
                zero_state2 = session.run(initial_state2) #I
                node_batch = stage_batches[batch_idx][gen_idx]
                for node_idx, node in enumerate(batch):
                    children = node_batch[node_idx].children
                    for i, child in  enumerate(children):
                        zero_state1[i].c[node_idx]= state1.c[child.data[3][2]]
                        zero_state1[i].h[node_idx]= state1.h[child.data[3][2]]
                        zero_state2[i].c[node_idx]= state2.c[child.data[3][2]]
                        zero_state2[i].h[node_idx]= state2.h[child.data[3][2]]
                for child_i, (c, h) in enumerate(initial_state1):
                    feed_dict[c] = state1[child_i].c
                    feed_dict[h] = state1[child_i].h
                for child_i, (c, h) in enumerate(initial_state2):
                    feed_dict[c] = state2[child_i].c
                    feed_dict[h] = state2[child_i].h
            
            
            fetches = {'outputs1':outputs1, 'outputs2':outputs2, 'final_state1':final_state1, 'final_state2':final_state2}
            results = session.run(fetches, feed_dict)
            break
        break
    break