In [1]:
import codecs
import functools
import os
import tempfile
import zipfile

from nltk.tokenize import sexpr
import numpy as np
from six.moves import urllib
import tensorflow as tf
import ChildSumTree

  from ._conv import register_converters as _register_converters


In [2]:
# Scratch directory that receives the downloaded archive and the files
# extracted from it (download_and_unzip reads this module-level name).
# mkdtemp() creates a new directory on every run; no visible cell removes it.
data_dir = tempfile.mkdtemp()
print('saving files to %s' % data_dir)

saving files to /var/folders/_4/p7lkljg13nvbgmfj_qvgf3vm0000gn/T/tmpetkz38kw


In [3]:
def download_and_unzip(url_base, zip_name, *file_names):
  """Fetch a zip archive and unpack selected members into ``data_dir``.

  Args:
    url_base: URL prefix; ``zip_name`` is appended to it verbatim.
    zip_name: File name of the archive, used both for the URL and for the
      local copy stored under the module-level ``data_dir``.
    *file_names: Names of the archive members to extract.

  Returns:
    A list with the extracted path of each requested member, in the order
    the names were given.
  """
  archive_path = os.path.join(data_dir, zip_name)
  source_url = url_base + zip_name
  print('downloading %s to %s' % (source_url, archive_path))
  urllib.request.urlretrieve(source_url, archive_path)

  def _extract(archive, member):
    # Announce each member right before it is unpacked.
    print('extracting %s' % member)
    return archive.extract(member, path=data_dir)

  with zipfile.ZipFile(archive_path, 'r') as archive:
    return [_extract(archive, member) for member in file_names]

In [4]:
# Download the GloVe 840B/300d archive and extract its single text file.
# The trailing comma unpacks the one-element list returned by
# download_and_unzip into full_glove_path.
# NOTE(review): the archive is fetched over plain HTTP and is not checksummed.
full_glove_path, = download_and_unzip(
  'http://nlp.stanford.edu/data/', 'glove.840B.300d.zip',
  'glove.840B.300d.txt')

downloading http://nlp.stanford.edu/data/glove.840B.300d.zip to /var/folders/_4/p7lkljg13nvbgmfj_qvgf3vm0000gn/T/tmpwiweseip/glove.840B.300d.zip
extracting glove.840B.300d.txt


In [3]:
# Path of the vocabulary-filtered GloVe subset: written by filter_glove()
# and read back by load_embeddings(). Relative to the working directory.
filtered_glove_path = 'sick_filtered_glove2.txt'

In [4]:
def filter_glove(sentence_paths=None, glove_path=None, out_path=None):
    """Write the subset of GloVe vectors whose word occurs in the tree files.

    The vocabulary is every whitespace-separated token found in the sentence
    files. For parse-tree files this also picks up tags and tree-drawing
    symbols such as ``+--`` and ``|``; they only matter if GloVe happens to
    contain an entry with the same spelling.

    Args:
        sentence_paths: Iterable of text files to collect the vocabulary
            from. Defaults to the four SICK train/trial tree files in the
            working directory.
        glove_path: Full GloVe text file to filter. Defaults to the
            module-level ``full_glove_path``.
        out_path: Destination for the filtered vectors. Defaults to the
            module-level ``filtered_glove_path``.

    Returns:
        The vocabulary as a set of tokens.
    """
    if sentence_paths is None:
        sentence_paths = ['sick_train_sentenceA_tree.txt', 'sick_train_sentenceB_tree.txt',
                          'sick_trial_sentenceA_tree.txt', 'sick_trial_sentenceB_tree.txt']
    if glove_path is None:
        glove_path = full_glove_path
    if out_path is None:
        out_path = filtered_glove_path

    vocab = set()
    for path in sentence_paths:
        # Decode as UTF-8, exactly like the GloVe file below, so that
        # non-ASCII tokens compare equal regardless of the platform default
        # encoding.
        with codecs.open(path, encoding='utf-8') as f:
            for line in f:
                vocab.update(line.split())

    nread = 0
    nwrote = 0
    with codecs.open(glove_path, encoding='utf-8') as f, \
            codecs.open(out_path, 'w', encoding='utf-8') as out:
        for line in f:
            nread += 1
            line = line.strip()
            if not line:
                continue
            # The word is everything before the first space on the line.
            if line.split(u' ', 1)[0] in vocab:
                out.write(line + '\n')
                nwrote += 1
    print('read %s lines, wrote %s' % (nread, nwrote))
    return vocab

In [6]:
# Build the filtered GloVe file and keep the vocabulary.
# filter_glove() reads the module-level full_glove_path, so the download cell
# must have run in this kernel first; otherwise this call raises NameError.
vocab = filter_glove()

NameError: name 'full_glove_path' is not defined

In [4]:
def load_embeddings(embedding_path):
    """Loads embeddings, returns weight matrix and dict from words to indices.

    Each non-blank line of the file is ``word v1 v2 ... vn`` separated by
    spaces. Blank lines are skipped.

    Args:
        embedding_path: Path of a UTF-8 text file in GloVe format.

    Returns:
        A tuple ``(weights, word_idx)``. ``weights`` is a float32 array with
        one row per word plus one extra, randomly initialised final row meant
        for unknown words. ``word_idx`` maps each word to its row index;
        ``(`` and ``)`` are stored under ``-LRB-`` and ``-RRB-``.

    Raises:
        ValueError: If the file contains no embedding lines.
    """
    print('loading word embeddings from %s' % embedding_path)
    weight_vectors = []
    word_idx = {}
    with codecs.open(embedding_path, encoding='utf-8') as f:
        for line in f:
            # A blank line has no "word vector" pair to unpack.
            if not line.strip():
                continue
            word, vec = line.split(u' ', 1)
            word_idx[word] = len(weight_vectors)
            weight_vectors.append(np.array(vec.split(), dtype=np.float32))
    if not weight_vectors:
        # Without at least one vector the embedding width is unknown.
        raise ValueError('no embeddings found in %s' % embedding_path)
    # Annoying implementation detail; '(' and ')' are replaced by '-LRB-' and
    # '-RRB-' respectively in the parse-trees.
    if u'(' in word_idx:
        word_idx[u'-LRB-'] = word_idx.pop(u'(')
    if u')' in word_idx:
        word_idx[u'-RRB-'] = word_idx.pop(u')')
    # Random embedding vector for unknown words.
    weight_vectors.append(np.random.uniform(
      -0.05, 0.05, weight_vectors[0].shape).astype(np.float32))
    return np.stack(weight_vectors), word_idx

In [5]:
# Load the filtered vectors. The last row of embedding_matrix is the random
# unknown-word vector that load_embeddings appends; word_to_idx has no entry
# pointing at it.
embedding_matrix, word_to_idx = load_embeddings(filtered_glove_path)
        

loading word embeddings from sick_filtered_glove2.txt


In [7]:
embedding_matrix.shape

(2336, 300)

In [6]:
# Model hyper-parameters shared by the graph-building cells below.
# Width passed to the LSTM cells and used as the input width of the
# similarity head.
hidden_size = 300
# NOTE(review): num_layers is not referenced in the visible cells.
num_layers = 2
# Fixed first dimension of the inputs_idx placeholder and of the zero states.
batch_size = 25
# dtype used for the embedding constant, placeholders and variables.
data_type = tf.float64
output_size = 21 #21 classes
# NOTE(review): reg_lambda is not referenced in the visible cells.
reg_lambda = 1e-4 #regularization parameter
# Number of child states built per layer in the model cell.
max_children = 10

In [7]:
# TF1 graph-mode session plus the embedding-lookup front end of the model.
session = tf.Session()
# One word id per batch slot; the shape is fixed to (batch_size,).
inputs_idx = tf.placeholder(tf.int32, shape=(batch_size, ), name="inputs_idx")
# The embedding matrix is baked into the graph as a constant (so it is not
# trained) and cast from the loaded float32 values to data_type.
embedding = tf.constant(embedding_matrix, dtype=data_type)
input_batch = tf.nn.embedding_lookup(embedding, inputs_idx)

In [10]:
input_batch

<tf.Tensor 'embedding_lookup:0' shape=(25, 300) dtype=float64>

In [8]:
"""The model."""
# ChildSumTree is already imported in the first cell; this adds the cst alias.
import ChildSumTree as cst

# proto_cell is only used below as a source of zero_state() tuples; it is
# never applied to any input in this cell.
proto_cell = tf.contrib.rnn.BasicLSTMCell(
          hidden_size, forget_bias=0.0, state_is_tuple=True,
          reuse=False)

# Two stacked tree-LSTM cells. Each one receives a list of max_children zero
# states. NOTE(review): presumably one state slot per potential child node;
# confirm against ChildSumTreeLSTMCell, which is defined outside this notebook.
cell_layer_1 = cst.ChildSumTreeLSTMCell(hidden_size)
initial_state1 = [proto_cell.zero_state(batch_size, data_type) for i in range(max_children)]
outputs1, final_state1 = cell_layer_1(input_batch, state=initial_state1, scope='Layer1')
# The second layer consumes the outputs of the first.
cell_layer_2 = cst.ChildSumTreeLSTMCell(hidden_size)
initial_state2 = [proto_cell.zero_state(batch_size, data_type) for i in range(max_children)]
outputs2, final_state2 = cell_layer_2(outputs1, state=initial_state2, scope='Layer2')

In [13]:
outputs2

<tf.Tensor 'Layer2/Mul_11:0' shape=(25, 300) dtype=float64>

In [10]:
p_hat

<tf.Tensor 'Softmax:0' shape=(?, 21) dtype=float64>

In [9]:
# Similarity head: combines two sentence-pair features into a distribution
# over output_size classes.
# Both features are fed through placeholders with unspecified shape; the
# matmuls below require them to be rank 2 with hidden_size columns.
h_abs_difference = tf.placeholder(data_type, name="h_abs_difference")
h_elewise_product = tf.placeholder(data_type, name="h_elewise_product")

W_h_abs_difference = tf.get_variable("W_h_abs_difference", [hidden_size, output_size], data_type)
W_h_elewise_product = tf.get_variable("W_h_elewise_product", [hidden_size, output_size], data_type)
B_h = tf.get_variable("B_h", [output_size], data_type)
# h_s = sigmoid(h_abs_difference . W_abs + h_elewise_product . W_prod + B_h)
h_s = tf.nn.xw_plus_b(h_abs_difference, W_h_abs_difference, B_h)
h_s = tf.add(tf.matmul(h_elewise_product, W_h_elewise_product), h_s)
h_s = tf.nn.sigmoid(h_s)

# h_s has output_size columns (see W_h_* above), so a square
# [output_size, output_size] W_p is shape-consistent with this graph.
W_p = tf.get_variable("W_p", [output_size, output_size], data_type) #TODO what's the correct shape??
B_p = tf.get_variable("B_p", [output_size], data_type)
p_hat = tf.nn.softmax(tf.nn.xw_plus_b(h_s, W_p, B_p))

# Disabled: expected score as the dot product of the class values r with p_hat.
# NOTE(review): tf.matmul needs rank-2 operands; r is rank 1, so it would have
# to be reshaped (e.g. to a (21, 1) column multiplied as p_hat x r) first.
#r = np.linspace(1.0,5.0,21) #1.0, 1.2, 1.4 ... 4.8, 5.0
#y_hat = tf.matmul(r, p_hat)


ValueError: Shape must be rank 2 but is rank 1 for 'MatMul_1' (op: 'MatMul') with input shapes: [41], [?,21].

In [6]:
"""Tree node class"""
class Node(object):
    """A tree node holding a list of data items, its children and its parent.

    Attributes:
        data: List of items describing the node. The parser stores the
            whitespace-separated tokens of a parse line here; other code may
            append non-string items (for example a location tuple).
        children: Child nodes, in insertion order.
        parent: The parent node, or None.
    """

    def __init__(self, data, parent=None):
        self.data = data
        self.children = []
        self.parent = parent

    def add_child(self, obj):
        """Append ``obj`` to this node's children (does not set obj.parent)."""
        self.children.append(obj)

    def add_parent(self, obj):
        """Set this node's parent to ``obj`` (does not update obj.children)."""
        self.parent = obj

    def __str__(self, tabs=0):
        """Render the subtree, indenting each level by two more spaces.

        Args:
            tabs: Number of leading spaces for this node's own line.
        """
        indent = " " * tabs
        # Convert every item with str() so that non-string entries in data
        # (such as an appended tuple) do not make the join raise TypeError.
        label = "|".join(str(item) for item in self.data)
        # Each child's text already ends with a newline, so joining siblings
        # with "\n" leaves a blank line between them.
        subtree = "\n".join(child.__str__(tabs + 2) for child in self.children)
        return indent + "+-- Node: " + label + "\n" + subtree

In [15]:
"""Preparing inputs"""
#Parse SyntaxNet output to sentence trees

def parse_dep_tree_text(file_name='sick_train_sentenceA_tree.txt'):
    """Parse a text dump of dependency trees into Node trees.

    The file is read line by line:

    * lines starting with ``Input:`` or ``Parse:`` are skipped;
    * a line whose last whitespace-separated token is ``ROOT`` starts a new
      tree, and its tokens become the root node's data;
    * each following line must contain ``+--``; the column of that marker is
      its depth, and the tokens left after removing ``|`` and ``+--`` become
      the node's data;
    * reading stops at the first blank line or at end of file.

    Args:
        file_name: Path of the parse dump to read.

    Returns:
        A list with the root Node of every tree, in file order.

    Raises:
        ValueError: If a line inside a tree does not contain ``+--``.
        IndexError: If a line is indented less than the first child of its
            tree, so that no parent can be found for it.
    """
    sentence_trees = []
    with open(file_name, 'r') as f:
        line = "placeholder"
        while line.strip() != "":
            line = f.readline()
            if line.startswith("Input:") or line.startswith("Parse:"):
                continue
            # Compare by value: the dependency label is the last token.
            # (Testing the marker position with the identity operator only
            # works for small cached integers and skips long root lines.)
            root_tokens = line.split()
            if not root_tokens or root_tokens[-1] != "ROOT":
                continue

            current_node = Node(root_tokens)
            sentence_trees.append(current_node)
            spaces = 0       # column of the "+--" marker of current_node
            node_stack = []  # (ancestor, ancestor's column) pairs
            while not line.startswith("Input:"):
                line = f.readline()
                if line.startswith("Input:") or line.startswith("Parse:"):
                    break
                if line.strip() == "":
                    break

                indent = line.index("+--")
                # Climb back up until we reach a node at or above this depth.
                while indent < spaces:
                    current_node, spaces = node_stack.pop()

                tokens = line.replace("|", "").replace("+--", "").split()
                if indent > spaces:
                    # Deeper than the current node: it becomes the parent.
                    parent = current_node
                    node_stack.append((current_node, spaces))
                else:
                    # Same depth: sibling of the current node.
                    parent = node_stack[-1][0]
                new_node = Node(tokens, parent=parent)
                parent.add_child(new_node)
                current_node = new_node
                spaces = indent
    return sentence_trees #a list of the roots nodes

In [8]:
"""split the tree nodes into batches and stages/generations of children"""
#TODO Shuffle the tree lists and the scores
def create_batches(trees, tree_batch_size=25):
    """Group trees into batches and arrange each batch's nodes bottom-up.

    Trees are taken in order, ``tree_batch_size`` at a time; trees left over
    after the last full batch are not returned. For every tree the nodes are
    collected level by level and the levels are reversed, so stage 0 of a
    batch holds the deepest level of every tree in it, stage 1 the level
    above, and so on until each tree's root.

    Args:
        trees: Sequence of root nodes; every node exposes ``children``.
        tree_batch_size: Number of trees per batch.

    Returns:
        ``(stage_batches, tree_batches)``: for each batch, the list of
        stages (each a list of nodes) and the slice of root nodes.
    """
    all_trees = np.array(trees)
    stage_batches, tree_batches = [], []
    for batch_no in range(len(all_trees) // tree_batch_size):
        first = batch_no * tree_batch_size
        roots = all_trees[first:first + tree_batch_size]
        tree_batches.append(roots)
        stages = []
        stage_batches.append(stages)
        for root in roots:
            # Breadth-first walk: one list of nodes per depth level.
            levels = []
            frontier = [root]
            while frontier:
                levels.append(list(frontier))
                frontier = [kid for node in frontier for kid in node.children]
            # Merge the levels, deepest first, into the batch-wide stages.
            for stage_no, level in enumerate(reversed(levels)):
                if stage_no < len(stages):
                    stages[stage_no].extend(level)
                else:
                    stages.append(level)
    return stage_batches, tree_batches

In [38]:
#TODO each node must know its location

def pad_batches(stage_batches, tree_batches, generation_size):
    """Pad every non-empty generation in place up to ``generation_size``.

    Padding entries are fresh ``Node([], parent=None)`` objects. Empty
    generations, and generations that already hold at least
    ``generation_size`` nodes, are left untouched.

    Args:
        stage_batches: Per batch, a list of generations (lists of nodes).
        tree_batches: Unused.
        generation_size: Target length of each non-empty generation.

    Returns:
        None; ``stage_batches`` is modified in place.
    """
    for stage_batch in stage_batches:
        for generation in stage_batch:
            if not generation:
                continue
            missing = generation_size - len(generation)
            generation.extend(Node([], parent=None) for _ in range(missing))
                

In [32]:
"""Function to load the target scores and split them into batches"""
from IPython.core.debugger import set_trace

def load_scores(file_name, batch_size):
    """Read one float score per line and group them into fixed-size batches.

    Blank lines are skipped. A score of 0 is kept like any other value.
    Scores left over after the last full batch are not returned.

    Args:
        file_name: Path of a text file with one number per line.
        batch_size: Number of scores per batch.

    Returns:
        A list of batches, each a list of ``batch_size`` floats.

    Raises:
        ValueError: If a non-blank line cannot be parsed as a float.
    """
    score_batches = []
    batch = []
    with open(file_name, 'r') as f:
        for line in f:
            text = line.strip()
            # Lines read from a file always carry at least their newline, so
            # emptiness has to be tested after stripping.
            if not text:
                continue
            batch.append(float(text))
            if len(batch) == batch_size:
                score_batches.append(batch)
                batch = []
    return score_batches
            

In [33]:
len(load_scores('sick_train_score.txt', 25))

180

In [12]:
"""Convert words to ids
 Takes batches of tree nodes arranged in stages/generations
 returns the same data structure as the input except instead of nodes there are ids"""
def covert_words_ids(stage_batches, word_to_idx, embedding_matrix):
    """Map every node in the staged batches to an embedding row index.

    The word of a node is ``node.data[0]``. Words missing from
    ``word_to_idx`` are reported with a printed line and mapped to the last
    row of ``embedding_matrix``. Nodes whose ``data`` is empty (such as the
    padding nodes made by ``pad_batches``) are mapped to that same last row
    without a printed line.

    Side effect: the tuple ``(batch_idx, gen_idx, node_idx)`` is appended to
    each node's ``data``; calling this function again appends another tuple.

    Args:
        stage_batches: Per batch, a list of generations (lists of nodes).
        word_to_idx: Dict from word to embedding row index.
        embedding_matrix: Embedding rows; only its length is used.

    Returns:
        A nested list shaped like ``stage_batches`` holding ids, not nodes.
    """
    unknown_id = len(embedding_matrix) - 1
    stage_batches_ids = []
    for batch_idx, stage_batch in enumerate(stage_batches):
        stage_batch_ids = []
        stage_batches_ids.append(stage_batch_ids)
        for gen_idx, generation in enumerate(stage_batch):
            generation_ids = []
            stage_batch_ids.append(generation_ids)
            for node_idx, node in enumerate(generation):
                # Read the word before the location tuple is appended, and
                # tolerate nodes that carry no data at all.
                word = node.data[0] if node.data else None
                node.data.append((batch_idx, gen_idx, node_idx))
                if word is None:
                    # NOTE(review): padding reuses the unknown-word row;
                    # confirm whether a dedicated padding row is wanted.
                    generation_ids.append(unknown_id)
                elif word in word_to_idx:
                    generation_ids.append(word_to_idx[word])
                else:
                    generation_ids.append(unknown_id)
                    print("unknown: "+word)
    return stage_batches_ids
            

unstitching
uninterestedly
midspeech


In [None]:
"""takes the target score array and converts them to probabilities """
def convert_scores_to_p(scores, num_of_classes=5):
    """Convert similarity scores into sparse target distributions.

    Scores are assumed to lie in ``[1, num_of_classes]`` (the SICK 1-5
    scale when ``num_of_classes`` is 5). The range is divided into
    ``num_of_classes * (num_of_classes - 1) + 1`` evenly spaced bins (21 for
    the default, i.e. 1.0, 1.2, ..., 5.0). A score that falls exactly on a
    bin gets probability 1 there; otherwise its mass is split linearly
    between the two neighbouring bins, so the expected bin value equals the
    score.

    Args:
        scores: Sequence of numeric scores.
        num_of_classes: Upper end of the score scale.

    Returns:
        Array of shape ``(len(scores), num_bins)`` whose rows sum to 1.

    Raises:
        ValueError: If a score lies outside ``[1, num_of_classes]``.
    """
    num_bins = num_of_classes * (num_of_classes - 1) + 1
    p = np.zeros((len(scores), num_bins))
    for i, score in enumerate(scores):
        # Zero-based fractional bin position: there are num_of_classes bins
        # per unit of score, and score 1 sits on bin 0.
        pos = (score - 1) * num_of_classes
        if pos < 0 or pos > num_bins - 1:
            raise ValueError(
                'score %r is outside [1, %r]' % (score, num_of_classes))
        floor = int(np.floor(pos))
        ceil = int(np.ceil(pos))
        if ceil == floor:
            p[i][floor] = 1
        else:
            p[i][floor] = ceil - pos
            p[i][ceil] = pos - floor
    return p

In [10]:
"""Feed the data in, run the session and prepare the final state for the next generation 
   Repeat until every root in the batch has an output
   Calculate the loss and optimize """
# NOTE(review): this cell is an unfinished draft and does not parse as written:
#   * the dict displays `{batch_size:}` and `{inputs:, targets:}` have keys
#     without values, and the last `for` loop has no body;
#   * initial_state, inputs, targets, outputs, final_state and
#     stage_batches_ids are not assigned in the visible cells (the model cell
#     defines initial_state1/2, outputs1/2, final_state1/2 and inputs_idx);
#   * batch_size is a plain Python int, not a placeholder, yet it is used as
#     a feed key;
#   * the loop variable `i` is rebound by each nested loop;
#   * `fetches = {outputs, final_state}` builds a set, not a dict or list.
epochs = 1000
writer = tf.summary.FileWriter("./", session.graph)
state = session.run(initial_state, {batch_size:})
for i in range(epochs): 
    for i, batch in enumerate(stage_batches_ids): 
        for j, generation in enumerate(batch):
            """Feed generation in and get output and states"""
            feed_dict = {inputs:, targets:}
            if j == 0: 
                """Use zerostate"""
                state = session.run(initial_state, {batch_size:len(generation)})
                for i, (c, h) in enumerate(initial_state):
                    feed_dict[c] = state[i][c]
                    feed_dict[h] = state[i][h]
            else: 
                """generate the state from the children of the nodes in the next bactch"""
                for i, (c, h) in enumerate(initial_state):
                    
            fetches = {outputs, final_state}

(2336, 300)