In [1]:
import xml.etree.ElementTree as ET
import numpy as np
from copy import deepcopy
from pprint import pprint
from keras.preprocessing import sequence as seq
import glob

  from ._conv import register_converters as _register_converters
Using Theano backend.


In [2]:
def number_of_trees_in_forest(forest):

    """
    Funkcja zwraca liczbe drzew w lesie forest.
    
    forest - las drzew [xml.etree.ElementTree.ElementTree]
    """
    
    _check_sentence(forest,"forest")
    
    return int(forest.find("stats").attrib["trees"])
    

In [3]:
def get_node(tree, node_id):
    return(tree.find(".//node[@nid='" + str(node_id) + "']"))

In [4]:
def is_ambigous(node):
    if len(node.findall("children"))>1:
        return(True)
    else:
        return(False)

In [5]:
def is_terminal(node):
    if node.find("terminal"):
        return(True)
    else:
        return(False)    

In [6]:
def transform_abmigous_node_to_disjunctive_nodes(node, root, max_id_of_conjunctive_nodes):
 
    """
    Funkcja modyfikuje node przez referencje!
    """

    number_of_disjunctive_nodes = 0

    childrens = node.findall("children")

    disjunctive_children = ET.Element('children')

    node.set("type","conjunctive_node_with_disjunctive_children")
    
    for children in list(childrens):

        new_disjunctive_node = ET.SubElement(root, 'node')
        new_disjunctive_node.set("type", "disjunctive")
        number_of_disjunctive_nodes += 1

        new_id = str(max_id_of_conjunctive_nodes+number_of_disjunctive_nodes)
        new_disjunctive_node.set("nid",new_id)

        disjuntive_child = ET.SubElement(disjunctive_children, 'child')
        disjuntive_child.set("nid",new_id)

        new_disjunctive_node.set("chosen",children.attrib.get("chosen","false"))
        category = ET.SubElement(new_disjunctive_node, 'nonterminal')
        category = ET.SubElement(category, 'category')
        category.text = "disjunctive_node"
        new_disjunctive_node.append(deepcopy(children))

        node.remove(children)

    node.append(disjunctive_children)

In [7]:
def get_maximum_node_id(tree):
    return(np.max([int(node.attrib["nid"]) for node in tree.findall("node")]))

In [8]:
def transform_to_graph_with_con_and_dis_nodes(tree):
    
    max_id_of_conjunctive_nodes = get_maximum_node_id(tree)
    root = tree.getroot()

    for node_id in range(max_id_of_conjunctive_nodes+1):

        node = get_node(tree, node_id)
        node.set("type", "terminal" if is_terminal(node) else "conjunctive")

        if is_ambigous(node):

            transform_abmigous_node_to_disjunctive_nodes(node, root, get_maximum_node_id(tree))


In [9]:
tree = ET.parse('../Składnica-frazowa-171220/NKJP_1M_0402000001/morph_3-p/morph_3.9-s.xml')
#ET.dump(tree)

In [10]:
tree = ET.parse("../Składnica-frazowa-171220/NKJP_1M_2002000137/morph_3-p/morph_3.36-s.xml")
#ET.dump(tree)

In [11]:
transform_to_graph_with_con_and_dis_nodes(tree)
#ET.dump(tree)

In [12]:
def terminals(tree):

    terminal_nodes = [x for x in tree.findall("node[terminal]")]

    terminals = [[(x.attrib["nid"],
                   x.find("terminal//orth").text.replace(" ", ""),  # zdarzaja sie przypadki ze token zawiera w sobie spacje i potem wyglada to jakby bylo wiecej tokenow i sie dlugosc nie zgadza
                   x.find("terminal//base").text, 
                   x.find("terminal//f").text)]  for x in terminal_nodes]

    ids = [x[0][0] for x in terminals]

    return terminals, ids 


In [13]:
def get_ids(tree):
    
    _ , ids = terminals(tree) 
    
    while "0" not in ids:

        for nid in ids:
            t = time.time()
            parents = tree.findall(".//children/child[@nid='"+str(nid)+"']....")

            for parent in parents:
                childs = parent.findall("children/child")
                childs_ids = [child.attrib["nid"] for child in childs]
                if np.all([child_id in ids for child_id in childs_ids]) and parent.attrib["nid"] not in ids:
                    ids.append(parent.attrib["nid"])

    return(ids)

In [14]:
def type_of_node(node):
    return(node.attrib["type"])

In [15]:
def token(terminal):
    assert is_terminal(terminal)
    return(terminal.find("terminal//orth").text.replace(" ", "")) # zdarzaja sie sytuacje, ze w tokenie jest spacja co psuje strukture

In [16]:
def get_head(tree, node_id):
    
    node = get_node(tree, node_id)
    
    if type_of_node(node) == "terminal":
        return(token(node))
    

    if type_of_node(node) == "disjunctive":      
        try:
            head_child_id = node.find("children/child[@head='true']").attrib["nid"]
            return(get_head(tree, head_child_id))
        except:
            return("__head_unknown__")
    
    
    children = node.findall("children")
    childs = node.findall("children/child")
    if len(children)==1 and type_of_node(get_node(tree,childs[0].attrib["nid"])) != "disjunctive": # wierzcholek jest koniunktywny i ma dzieci koninktywne
        try:
            head_child_id = node.find("children/child[@head='true']").attrib["nid"]
            return(get_head(tree, head_child_id))
        except:
            return("__head_unknown__")
    
        
    else: #mamy wierzcholek koniunktywny, ktorego dzieci sa dysjunktywne
        
        child_ids = [child.attrib["nid"] for child in node.findall("children/child")]
        heads = [get_head(tree, child_id) for child_id in child_ids]
        if len(set(heads))==1: #wszystkie opcje maja taka sama glowe
            return(heads[0])# to glowa wierzcholka dysjunktywnego jest wyznaczona, bo niezalezo od opcji
        else:
            return("__node_with_undetermined_head__")
        #MOZNA TEZ ROZWAZYC CZY NIE POWINNA BYC TO SREDNIA Z EMBEDDINGOW MOZLIWYCH HEAD'OW
    

In [17]:
def get_children_rule(tree, node):
    
    if is_terminal(node):
        return("__terminal__")
    
    
    children = node.find("children")
    
    if "rule" in children.attrib.keys():
        return(children.attrib["rule"])
    else:
        children_rules = [get_children_rule(tree,get_node(tree,child.attrib["nid"])) for child in children.findall("child")]
        if len(set(children_rules))==1:
            return(children_rules[0])
        else:
            return("__node_with_undetermined_children_rule__")

In [18]:
def get_info(tree, nid):
    
    node = get_node(tree, nid)
    
    if type_of_node(node) == "terminal":
        
        infos = [get_children_rule(tree, node),
                 node.find("terminal//base").text, 
                 node.find("terminal//f").text]
    else:
        
        
        children_rule = get_children_rule(tree, node)
        
        infos = [x.text for x in node.find("nonterminal").getchildren()]
        categories = infos[0]
        attributes = ":".join(infos[1:]) if len(infos)>1 else "None"
        infos = [children_rule, categories, attributes]
        
        # JESLI NA ZBIORZE TESTOWYM POJAWIA SIE KOMBINACJA ATRYBUTOW, KTOREJ NIE BYLO W ZBIORZE TRENINGOWYM, 
        # TO PRZYPISAC W JEJ MIEJSCE NAJBARDZIEJ PODOBNA 
        # - TYLKO TRZEBA MERYTORYCZNIE WLASCIWIE OKRESLIC PODOBIENSTWO
        
    return(infos)

In [19]:
def get_children_positions_in_graph(tree, node_id, ids):
    
    node = get_node(tree,node_id)
    
    if is_terminal(node):
        return([-1])
    
    else:
        
        children_ids = [child.attrib["nid"] for child in node.findall("children/child")]
        children_positions = [ids.index(child_id) for child_id in children_ids]
        return(children_positions)
    

In [20]:
def is_chosen(node):
    if node.attrib["chosen"]=="true":
        return(1)
    else:
        return(0)

In [21]:
def get_labels(tree):
    labels = [-1 if type_of_node(get_node(tree,node_id)) != "disjunctive" else is_chosen(get_node(tree,node_id)) for node_id in get_ids(tree)]
    return(labels)
    

In [22]:
def get_representation(tree, words2ids):
    
    ids = get_ids(tree)
    children_matrix = seq.pad_sequences([get_children_positions_in_graph(tree,x,ids) for x in ids],value=-1, padding='post')
    labels = get_labels(tree)
    heads = [words2ids.get(get_head(tree,x),-1) for x in ids]

    types = [int(type_of_node(get_node(tree,i)) == "disjunctive") for i in ids]
    return([heads, 
             children_matrix,
             labels,
             list(range(len(children_matrix))),
           types])

In [25]:
import numpy as np
import random
import time
import os
import itertools
import pickle
import  csv
from collections import Counter, OrderedDict

from keras.preprocessing import sequence as seq

import os    
os.environ['THEANO_FLAGS'] = "optimizer = 'None'"

import theano
from theano import tensor as T
from theano.ifelse import ifelse
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 



s = {'lr':0.05,
         'nepochs':80,
         'seed':345,
         'nc':2,        # number of y classes
         'h_dim': 55,
         'h_dropout_rate': 0.5,
         'emb_dropout_rate': 0.5,
         'time_without_improvement': 10,
         'batch_size': 1,
         'w2v_DIM': "300",
         "rules_emb_dim": 0
         }  

dataType = 'int64'
  
np.random.seed(s['seed']) 

#ile_with_filtered_embeddings = "embeddings/filtered_nkjp+wiki-forms-restricted-300-cbow-ns.pkl"
#2vecs = pickle.load(open(file_with_filtered_embeddings,"rb"))

rnn = TreeLSTM( h_dim = s['h_dim'],
            nc = s['nc'],
        w2v_model_path = "embeddings/filtered_train_and_test_w2v_allwiki_nkjpfull_300.pkl",
            max_phrase_length = 1000,
        emb_dropout_rate = s['emb_dropout_rate'],
        h_dropout_rate = s['h_dropout_rate'],
        l = 0.0001,
        srng = RandomStreams(12345),
        file_with_rules =  "/home/norbert/Doktorat/SyntacticTreesDisambiguation/Składnica_preprocessed_training_data/rules.txt",
        rules_emb_dim = s["rules_emb_dim"],
        load_params= False#"/home/norbert/Doktorat/SyntacticTreesDisambiguation/Model/model_params_116.pkl"
    )




In [26]:
data_folder = "/home/norbert/Doktorat/SyntacticTreesDisambiguation/Składnica_raw_data/Train/*.xml"
files = glob.glob(data_folder,recursive=True)
data = []
for i, file in enumerate(files[:20]):
    if i % 10 == 0:
        print(i)
    forest = ET.parse(file)
    if number_of_trees_in_forest(forest) < 100000:
        transform_to_graph_with_con_and_dis_nodes(forest)
        data.append(get_representation(forest,rnn.words2ids))

0
10


In [32]:
class TreeLSTM(object):  

    def __init__(self, h_dim, nc, w2v_model_path, file_with_rules, 
                 rules_emb_dim, max_phrase_length, emb_dropout_rate, h_dropout_rate, l, srng,
                load_params=None): 

        '''

        - dropout stanu ukrytego (LSTM_1)
        - dropout embeddinga (LSTM_1)
        - regularyzacja l2 (LSTM_1)
        - indywidualna obsluga lisci - struktura taka sama, macierze te same, ale uczymy: h_aggregated_0, hidden_state_0, cell_state_0, zamiast brac w te miejsca 0


        nh :: dimension of hidden state
        nc :: number of classes
        '''

        self.max_phrase_length = max_phrase_length

        w2vecs = pickle.load(open(w2v_model_path,"rb"))
        self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX))
        self.words2ids = w2vecs["words2ids"]

        emb_dim = w2vecs["vectors"].shape[1]
        del w2vecs

        
        r = open(file_with_rules,"r")
        rules = [x.split() for x in r.readlines()]
        r.close()
        unique_rules = set()
        for i in range(len(rules)):
            for j in range(len(rules[i])):
                unique_rules.add(rules[i][j])
                
        number_of_uniue_rules = len(unique_rules)
 
        r = 0.05

        self.rules2ids = dict(zip(unique_rules,range(number_of_uniue_rules)))
        self.emb_rules = theano.shared(r * np.random.uniform(-1,1,(number_of_uniue_rules, rules_emb_dim)).astype(theano.config.floatX))
        
   

        self.W_i = theano.shared(r * np.random.uniform(-1.0, 1.0, (emb_dim+rules_emb_dim, h_dim) ).astype(theano.config.floatX))
        self.U_i = theano.shared(r * np.random.uniform(-1.0, 1.0, (h_dim, h_dim) ).astype(theano.config.floatX))
        self.b_i = theano.shared(r * np.random.uniform(-1.0, 1.0, h_dim ).astype(theano.config.floatX))

        self.W_f = theano.shared(r * np.random.uniform(-1.0, 1.0, (emb_dim+rules_emb_dim, h_dim) ).astype(theano.config.floatX))
        self.U_f = theano.shared(r * np.random.uniform(-1.0, 1.0, (h_dim, h_dim) ).astype(theano.config.floatX))
        self.b_f = theano.shared(r * np.random.uniform(-1.0, 1.0, h_dim ).astype(theano.config.floatX))
        
        self.W_o = theano.shared(r * np.random.uniform(-1.0, 1.0, (emb_dim+rules_emb_dim, h_dim) ).astype(theano.config.floatX))
        self.U_o = theano.shared(r * np.random.uniform(-1.0, 1.0, (h_dim, h_dim) ).astype(theano.config.floatX))
        self.b_o = theano.shared(r * np.random.uniform(-1.0, 1.0, h_dim ).astype(theano.config.floatX))

        self.W_u = theano.shared(r * np.random.uniform(-1.0, 1.0, (emb_dim+rules_emb_dim, h_dim) ).astype(theano.config.floatX))
        self.U_u = theano.shared(r * np.random.uniform(-1.0, 1.0, (h_dim, h_dim) ).astype(theano.config.floatX))
        self.b_u = theano.shared(r * np.random.uniform(-1.0, 1.0, h_dim ).astype(theano.config.floatX))

        self.W_y   = theano.shared(r * np.random.uniform(-1.0, 1.0, (h_dim, nc)).astype(theano.config.floatX))
        self.b_y   = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX))



        self.W_i_dis = theano.shared(r * np.random.uniform(-1.0, 1.0, (emb_dim+rules_emb_dim, h_dim) ).astype(theano.config.floatX))
        self.U_i_dis = theano.shared(r * np.random.uniform(-1.0, 1.0, (h_dim, h_dim) ).astype(theano.config.floatX))
        self.b_i_dis = theano.shared(r * np.random.uniform(-1.0, 1.0, h_dim ).astype(theano.config.floatX))

        self.W_f_dis = theano.shared(r * np.random.uniform(-1.0, 1.0, (emb_dim+rules_emb_dim, h_dim) ).astype(theano.config.floatX))
        self.U_f_dis = theano.shared(r * np.random.uniform(-1.0, 1.0, (h_dim, h_dim) ).astype(theano.config.floatX))
        self.b_f_dis = theano.shared(r * np.random.uniform(-1.0, 1.0, h_dim ).astype(theano.config.floatX))
        
        self.W_o_dis = theano.shared(r * np.random.uniform(-1.0, 1.0, (emb_dim+rules_emb_dim, h_dim) ).astype(theano.config.floatX))
        self.U_o_dis = theano.shared(r * np.random.uniform(-1.0, 1.0, (h_dim, h_dim) ).astype(theano.config.floatX))
        self.b_o_dis = theano.shared(r * np.random.uniform(-1.0, 1.0, h_dim ).astype(theano.config.floatX))

        self.W_u_dis = theano.shared(r * np.random.uniform(-1.0, 1.0, (emb_dim+rules_emb_dim, h_dim) ).astype(theano.config.floatX))
        self.U_u_dis = theano.shared(r * np.random.uniform(-1.0, 1.0, (h_dim, h_dim) ).astype(theano.config.floatX))
        self.b_u_dis = theano.shared(r * np.random.uniform(-1.0, 1.0, h_dim ).astype(theano.config.floatX))

        
        


        self.h_aggregated_0 = theano.shared(r * np.random.uniform(-1.0, 1.0, h_dim ).astype(theano.config.floatX))
        self.cell_state_0 = theano.shared(r * np.random.uniform(-1.0, 1.0, h_dim ).astype(theano.config.floatX))
        self.hidden_state_0 = theano.shared(r * np.random.uniform(-1.0, 1.0, h_dim ).astype(theano.config.floatX))



        self.srng = srng
        self.h_dropout_rate = h_dropout_rate
        self.emb_dropout_rate = emb_dropout_rate
        self.l = l


        if load_params:
            load_params = pickle.load(open(load_params,"rb"))
            if type(load_params)==list:
                load_params = dict(load_params)
            for key in load_params.keys():
                if key not in ['emb', 'emb_rules', 'W_i', 'U_i', 'b_i', 'W_f', 'U_f', 'b_f', 'W_o', 'U_o', 'b_o', 'W_u', 'U_u', 'b_u', 'W_y', 'b_y', 'h_aggregated_0', 'cell_state_0', 'hidden_state_0']:
                    setattr(self, key, load_params[key])
                else:
                    setattr(self, key, theano.shared(load_params[key]))
        
        

        def one_step(word_id, rule_id, word_children_positions, y_true, k, node_type, hidden_states, cell_states, learning_rate):

            x = self.emb[word_id]#T.concatenate( [self.emb[word_id], self.emb_rules[rule_id] ])

            #dropout:
            mask1 = self.srng.binomial(n=1, p=1-self.emb_dropout_rate, size=(emb_dim+rules_emb_dim,), dtype='floatX')
            x = x * mask1


            tmp = word_children_positions>=0.0
            number_of_children = tmp.sum(dtype = theano.config.floatX) 
                  
            idx_tmp = tmp.nonzero()                                    
      
            h_aggregated = ifelse(T.gt(number_of_children, 0.0), 
                                  ifelse(T.eq(node_type,1),
                                           hidden_states[word_children_positions[idx_tmp]].mean(axis=0),
                                           hidden_states[word_children_positions[idx_tmp]].sum(axis=0)), 
                                  self.h_aggregated_0)
            h_aggregated_printed = theano.printing.Print('h_aggregated')(h_aggregated)

            node_type_printed = theano.printing.Print('node')(node_type)
            
            i = ifelse(T.eq(node_type_printed,1),
                         T.nnet.sigmoid(	T.dot(x, self.W_i_dis) + T.dot(h_aggregated_printed, self.U_i_dis) + self.b_i_dis),
                         T.nnet.sigmoid(	T.dot(x, self.W_i) + T.dot(h_aggregated_printed, self.U_i) + self.b_i))             

            o = ifelse(T.eq(node_type,1),
                         T.nnet.sigmoid(	T.dot(x, self.W_o_dis) + T.dot(h_aggregated, self.U_o_dis) + self.b_o_dis),
                         T.nnet.sigmoid(	T.dot(x, self.W_o) + T.dot(h_aggregated, self.U_o) + self.b_o))             

            u = ifelse(T.eq(node_type,1),
                         T.tanh(	T.dot(x, self.W_u_dis) + T.dot(h_aggregated, self.U_u_dis) + self.b_u_dis),
                         T.tanh(	T.dot(x, self.W_u) + T.dot(h_aggregated, self.U_u) + self.b_u))            

            f_c = ifelse(T.gt(number_of_children, 0.0), 
                 ifelse(T.eq(node_type,1),
                (T.nnet.sigmoid( T.dot(x, self.W_f_dis ) + T.dot(hidden_states[word_children_positions[idx_tmp]], self.U_f_dis)  + self.b_f_dis )*cell_states[word_children_positions[idx_tmp]]).sum(axis=0),          
                (T.nnet.sigmoid( T.dot(x, self.W_f ) + T.dot(hidden_states[word_children_positions[idx_tmp]], self.U_f)  + self.b_f )*cell_states[word_children_positions[idx_tmp]]).sum(axis=0)),
                T.nnet.sigmoid( T.dot(x, self.W_f ) + T.dot(self.hidden_state_0, self.U_f)  + self.b_f ) * self.cell_state_0
            )

            f_c_printed = theano.printing.Print('F_C')(f_c)
            
            c = i*u + f_c_printed

            h = o * T.tanh(c)
            h_printed = theano.printing.Print('H:')(h)
            
            #dropout:
            mask = self.srng.binomial(n=1, p=1-self.h_dropout_rate, size=(h_dim,), dtype='floatX')
            h = h_printed * mask

            k_printed = theano.printing.Print('K:')(k)
            
            current_cell_state = cell_states[k_printed]
            cell_states_new = T.set_subtensor(current_cell_state, c)

            current_hidden_state = hidden_states[k_printed]
            hidden_states_new = T.set_subtensor(current_hidden_state, h)


            y_prob = T.nnet.softmax(T.dot(h,self.W_y) + self.b_y)[0]

            cross_entropy = ifelse(T.eq(node_type,1), -T.log(y_prob[y_true]), 0.0)						      

            return cross_entropy, hidden_states_new, cell_states_new  


        y = T.vector('y',dtype=dataType)
        learning_rate = T.scalar('lr',dtype=theano.config.floatX)
        words = T.vector(dtype=dataType)
        rules = T.vector(dtype=dataType)
        children_positions = T.matrix(dtype=dataType)
        words_indexes = T.vector(dtype=dataType)
        node_types = T.vector(dtype=dataType)

        [cross_entropy_vector, _, _] , _ = theano.scan(fn=one_step, \
                                 sequences = [words, rules, children_positions, y, words_indexes, node_types],
                                 outputs_info = [None,
                                                 T.zeros((T.shape(words)[0],h_dim), dtype = theano.config.floatX),
                                                 T.zeros((T.shape(words)[0],h_dim), dtype = theano.config.floatX)],
                                 non_sequences = learning_rate)#,
                                 #n_steps = words.shape[0])

        cross_entropy_vector_printed = theano.printing.Print('cross_entropy_vector:')(cross_entropy_vector) 
            
        cost = T.mean(cross_entropy_vector_printed) #+ self.l * (self.emb_rules**2).sum() 
        
        updates = OrderedDict([
            (self.W_i, self.W_i-learning_rate*T.grad(cost, self.W_i)),
            (self.W_f, self.W_f-learning_rate*T.grad(cost, self.W_f)),
            (self.W_o, self.W_o-learning_rate*T.grad(cost, self.W_o)),
            (self.W_u, self.W_u-learning_rate*T.grad(cost, self.W_u)),
            (self.W_y, self.W_y-learning_rate*T.grad(cost, self.W_y)),

            (self.U_i, self.U_i-learning_rate*T.grad(cost, self.U_i)),
            (self.U_f, self.U_f-learning_rate*T.grad(cost, self.U_f)),
            (self.U_o, self.U_o-learning_rate*T.grad(cost, self.U_o)),
            (self.U_u, self.U_u-learning_rate*T.grad(cost, self.U_u)),

            #(self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #SPROBOWAC TU 0.1 ZAMIAST LR, A DLA POLSKICH BEZ AKTUALIZACJI EMB
            #(self.emb_rules, self.emb_rules-learning_rate*T.grad(cost, self.emb_rules)),
            (self.b_i, self.b_i-learning_rate*T.grad(cost,self.b_i)),
                        (self.b_f, self.b_f-learning_rate*T.grad(cost,self.b_f)),
                        (self.b_o, self.b_o-learning_rate*T.grad(cost,self.b_o)),
                        (self.b_u, self.b_u-learning_rate*T.grad(cost,self.b_u)),
                        (self.b_y, self.b_y-learning_rate*T.grad(cost,self.b_y)),

            (self.W_i_dis, self.W_i_dis-learning_rate*T.grad(cost, self.W_i_dis)),
            (self.W_f_dis, self.W_f_dis-learning_rate*T.grad(cost, self.W_f_dis)),
            (self.W_o_dis, self.W_o_dis-learning_rate*T.grad(cost, self.W_o_dis)),
            (self.W_u_dis, self.W_u_dis-learning_rate*T.grad(cost, self.W_u_dis)),

            (self.U_i_dis, self.U_i_dis-learning_rate*T.grad(cost, self.U_i_dis)),
            (self.U_f_dis, self.U_f_dis-learning_rate*T.grad(cost, self.U_f_dis)),
            (self.U_o_dis, self.U_o_dis-learning_rate*T.grad(cost, self.U_o_dis)),
            (self.U_u_dis, self.U_u_dis-learning_rate*T.grad(cost, self.U_u_dis)),

            #(self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #SPROBOWAC TU 0.1 ZAMIAST LR, A DLA POLSKICH BEZ AKTUALIZACJI EMB
            #(self.emb_rules, self.emb_rules-learning_rate*T.grad(cost, self.emb_rules)),
            (self.b_i_dis, self.b_i_dis-learning_rate*T.grad(cost,self.b_i_dis)),
                        (self.b_f_dis, self.b_f_dis-learning_rate*T.grad(cost,self.b_f_dis)),
                        (self.b_o_dis, self.b_o_dis-learning_rate*T.grad(cost,self.b_o_dis)),
                        (self.b_u_dis, self.b_u_dis-learning_rate*T.grad(cost,self.b_u_dis)),            
            
            
            
            (self.h_aggregated_0, self.h_aggregated_0-learning_rate*T.grad(cost,self.h_aggregated_0)),
            (self.cell_state_0, self.cell_state_0-learning_rate*T.grad(cost,self.cell_state_0)),
            (self.hidden_state_0, self.hidden_state_0-learning_rate*T.grad(cost,self.hidden_state_0))

            ])

        self.train = theano.function( inputs  = [words, rules, children_positions, y, words_indexes, node_types, learning_rate],
                                      outputs = [],
                                      updates = updates,
                                      allow_input_downcast=True,
                                      mode='FAST_RUN'
                                      )


        def one_step_classify(word_id, rule_id, word_children_positions, k, node_type, hidden_states, cell_states):

            x = self.emb[word_id] #T.concatenate( [self.emb[word_id], self.emb_rules[rule_id] ])

            x = (1-self.emb_dropout_rate) * x

            tmp = word_children_positions>=0.0
            number_of_children = tmp.sum(dtype = theano.config.floatX) 
            idx_tmp = tmp.nonzero()                                                                   # indeksy realne dzieci - czyli te, gdzie nie ma -1        

            h_aggregated = ifelse(T.gt(number_of_children, 0.0), 
                                  ifelse(T.eq(node_type,1),
                                           hidden_states[word_children_positions[idx_tmp]].mean(axis=0),
                                           hidden_states[word_children_positions[idx_tmp]].sum(axis=0)), 
                                  self.h_aggregated_0)


            i = ifelse(T.eq(node_type,1),
                         T.nnet.sigmoid(	T.dot(x, self.W_i_dis) + T.dot(h_aggregated, self.U_i_dis) + self.b_i_dis),
                         T.nnet.sigmoid(	T.dot(x, self.W_i) + T.dot(h_aggregated, self.U_i) + self.b_i))             

            o = ifelse(T.eq(node_type,1),
                         T.nnet.sigmoid(	T.dot(x, self.W_o_dis) + T.dot(h_aggregated, self.U_o_dis) + self.b_o_dis),
                         T.nnet.sigmoid(	T.dot(x, self.W_o) + T.dot(h_aggregated, self.U_o) + self.b_o))             

            u = ifelse(T.eq(node_type,1),
                         T.tanh(	T.dot(x, self.W_u_dis) + T.dot(h_aggregated, self.U_u_dis) + self.b_u_dis),
                         T.tanh(	T.dot(x, self.W_u) + T.dot(h_aggregated, self.U_u) + self.b_u))            

            f_c = ifelse(T.gt(number_of_children, 0.0), 
                 ifelse(T.eq(node_type,1),
                    (T.nnet.sigmoid( T.dot(x, self.W_f_dis ) + T.dot(hidden_states[word_children_positions[idx_tmp]], self.U_f_dis)  + self.b_f_dis )*cell_states[word_children_positions[idx_tmp]]).sum(axis=0),          
                    (T.nnet.sigmoid( T.dot(x, self.W_f ) + T.dot(hidden_states[word_children_positions[idx_tmp]], self.U_f)  + self.b_f )*cell_states[word_children_positions[idx_tmp]]).sum(axis=0)),
                T.nnet.sigmoid( T.dot(x, self.W_f ) + T.dot(self.hidden_state_0, self.U_f)  + self.b_f ) * self.cell_state_0
            )

            c = i*u + f_c

            h = o * T.tanh(c)
            # podczas uczenia zerowalismy 1-dropout_rate procent wspolrzednych, wiec trzeba to 
            h = h * (1-self.h_dropout_rate)

            current_cell_state = cell_states[k]
            cell_states_new = T.set_subtensor(current_cell_state, c)

            current_hidden_state = hidden_states[k]
            hidden_states_new = T.set_subtensor(current_hidden_state, h)


            y_prob = T.nnet.softmax(T.dot(h,self.W_y) + self.b_y)[0]             

            return  y_prob, hidden_states_new, cell_states_new


        [y_probs_classify, _ , _ ], _ = theano.scan(
                 fn=one_step_classify, 
                                 sequences = [words, rules, children_positions, words_indexes, node_types],
                 outputs_info = [None,
                                 T.zeros((T.shape(words)[0],h_dim), dtype = theano.config.floatX),
                                 T.zeros((T.shape(words)[0],h_dim), dtype = theano.config.floatX)])

        predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]), 
                                     sequences = [words_indexes])
        
        probs, _ = theano.scan(lambda i: y_probs_classify[i], 
                                     sequences = [words_indexes])

        self.classify = theano.function(inputs=[words, rules, children_positions, words_indexes, node_types], 
                                     outputs=predictions,
                                     allow_input_downcast=True,
                                     mode='FAST_RUN' 
                                     )

        self.predict_proba = theano.function(inputs=[words, rules, children_positions,words_indexes, node_types], 
                             outputs=probs,
                             allow_input_downcast=True,
                             mode='FAST_RUN' 
                             )

        self.calculate_loss = theano.function(inputs=[words, rules, children_positions, y, words_indexes, node_types, learning_rate], 
                     outputs=cost,
                     allow_input_downcast=True,
                     mode='FAST_RUN' 
                     )
        
    def save_model(self,path):
        params = [ (k, v.get_value())  if type(v)==theano.tensor.sharedvar.TensorSharedVariable else (k,v) for k, v in list(self.__dict__.items())]
        params = dict(params)
        pickle.dump(params,open(path,"wb"))

        

In [33]:
s = {'lr':0.05,
         'nepochs':80,
         'seed':345,
         'nc':2,        # number of y classes
         'h_dim': 55,
         'h_dropout_rate': 0.5,
         'emb_dropout_rate': 0.5,
         'time_without_improvement': 10,
         'batch_size': 1,
         'w2v_DIM': "300",
         "rules_emb_dim": 0
         }  

dataType = 'int64'
  
np.random.seed(s['seed']) 

#ile_with_filtered_embeddings = "embeddings/filtered_nkjp+wiki-forms-restricted-300-cbow-ns.pkl"
#2vecs = pickle.load(open(file_with_filtered_embeddings,"rb"))

rnn = TreeLSTM( h_dim = s['h_dim'],
            nc = s['nc'],
        w2v_model_path = "embeddings/filtered_train_and_test_w2v_allwiki_nkjpfull_300.pkl",
            max_phrase_length = 1000,
        emb_dropout_rate = s['emb_dropout_rate'],
        h_dropout_rate = s['h_dropout_rate'],
        l = 0.0001,
        srng = RandomStreams(12345),
        file_with_rules =  "/home/norbert/Doktorat/SyntacticTreesDisambiguation/Składnica_preprocessed_training_data/rules.txt",
        rules_emb_dim = s["rules_emb_dim"],
        load_params= False#"/home/norbert/Doktorat/SyntacticTreesDisambiguation/Model/model_params_116.pkl"
    )



In [34]:
train_data[0][1]

array([[-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [ 0, -1, -1, -1],
       [ 1, -1, -1, -1],
       [ 2, -1, -1, -1],
       [ 3, -1, -1, -1],
       [ 4, -1, -1, -1],
       [ 4, -1, -1, -1],
       [ 4, -1, -1, -1],
       [ 4, -1, -1, -1],
       [ 5, -1, -1, -1],
       [ 6, -1, -1, -1],
       [ 7, -1, -1, -1],
       [ 7, -1, -1, -1],
       [ 8, -1, -1, -1],
       [ 9, 10, -1, -1],
       [ 9, 11, -1, -1],
       [ 9, 12, -1, -1],
       [ 9, 13, -1, -1],
       [15, -1, -1, -1],
       [15, 17, -1, -1],
       [16, -1, -1, -1],
       [18, -1, -1, -1],
       [18, -1, -1, -1],
       [19, -1, -1, -1],
       [20, -1, -1, -1],
       [21, -1, -1, -1],
       [22, -1, -1, -1],
       [23, 25, 26, 28],
       [23, 25, 27, 29],
       [24, -1, -1, -1],
       [34, 26, 30, -1],
       [34, 27, 31, -1],
       [32, 33, 35, 36],
       [37, 14, -1, -1]], dtype=int32)

In [35]:
for e in range(10):

    tic = time.time()
    
    #random.shuffle(train_data)

    tic = time.time()
    for i in range(n_train):

        #if i % 100 == 0:
        print(i)
            
        rnn.train(train_data[i][0], train_data[i][0], train_data[i][1], train_data[i][2], train_data[i][3], train_data[i][4], s['lr'])

    loss = 0
    counts_test = np.zeros((s['nc'],s['nc']),dtype='int')
    for i in range(n_val):
        pred = rnn.classify(validation_data[i][0],validation_data[i][0],validation_data[i][1], validation_data[i][3], validation_data[i][4])
        for j in range(len(pred)):
            if validation_data[i][2][j]>=0:
                counts_test[pred[j], validation_data[i][2][j]] += 1
        
    
    # Train
    counts = np.zeros((s['nc'],s['nc']),dtype='int')
    for i in range(n_train):

        pred  = rnn.classify(train_data[i][0], train_data[i][0] ,train_data[i][1], train_data[i][3], train_data[i][4])
        for j in range(len(pred)):
            if train_data[i][2][j]>=0:
                counts[pred[j], train_data[i][2][j]] += 1
        

    #if e>0:# and e%3==0:
        #rnn.save_model("/home/norbert/Doktorat/SyntacticTreesDisambiguation/Model/model_forest_params_"+str(e)+".pkl")
        

    print(e, " Train: ", "%0.2f" % (100 * np.diag(counts).sum()/float(counts.sum())),
        " Valid all: ","%0.2f" % (100 * np.diag(counts_test).sum()/float(counts_test.sum())),
        "   time: ", time.time()-tic)

0
node __str__ = 0
h_aggregated __str__ = [ 0.04726947  0.00212552  0.04875937  0.03854429  0.04715015  0.04093371
  0.01110027 -0.04609891 -0.0274325  -0.00254248 -0.019198    0.04305265
 -0.01264124 -0.04472617 -0.03833277 -0.02297279 -0.0268264   0.04276826
 -0.02019623  0.04271401 -0.00853868 -0.02791818  0.00166526 -0.0228592
 -0.03774626 -0.03121801  0.02466259  0.01968496 -0.04103607 -0.03195578
 -0.02696705 -0.00689766  0.01202474 -0.00353032 -0.00763083 -0.03805416
 -0.00164491  0.00977198 -0.01370198  0.00572623  0.03771946  0.02355511
 -0.0089756  -0.03286357  0.02243487  0.04702048  0.00302503 -0.04868226
 -0.03132183 -0.02218989 -0.0493987   0.00818888 -0.02063473 -0.0187419
 -0.02212263]
F_C __str__ = [-0.00506874 -0.01318286 -0.00501794  0.01205531 -0.01047135 -0.02746134
  0.01357536  0.0089699   0.02261446  0.01141609  0.00860194  0.03618401
 -0.00608493  0.01238367  0.03381016  0.00120513  0.02444744  0.02917705
 -0.01251153 -0.01629903 -0.00132196  0.01613187 -0.0146

ValueError: An output of the scan has changed shape. This may be caused by a pushout optimization. Try adding 'optimizer_excluding=scanOp_pushout_output' to your Theano flags.
Apply node that caused the error: forall_inplace,cpu,grad_of_scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4), i5)}}.0, Elemwise{ge,no_inplace}.0, Elemwise{eq,no_inplace}.0, Alloc.0, Alloc.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Alloc.0, Alloc.0, Alloc.0, Alloc.0, Alloc.0, Alloc.0, Alloc.0, Alloc.0, Alloc.0, Alloc.0, Alloc.0, Alloc.0, Alloc.0, Alloc.0, Alloc.0, Alloc.0, Alloc.0, Alloc.0, Alloc.0, Elemwise{Composite{minimum(minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4), i5)}}.0, Elemwise{Composite{minimum(minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4), i5)}}.0, Elemwise{Composite{minimum(minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4), i5)}}.0, Elemwise{Composite{minimum(minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4), i5)}}.0, Elemwise{Composite{minimum(minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4), i5)}}.0, Elemwise{Composite{minimum(minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4), i5)}}.0, Elemwise{Composite{minimum(minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4), i5)}}.0, Elemwise{Composite{minimum(minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4), i5)}}.0, Elemwise{Composite{minimum(minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4), i5)}}.0, Elemwise{Composite{minimum(minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4), i5)}}.0, Elemwise{Composite{minimum(minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4), i5)}}.0, Elemwise{Composite{minimum(minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4), i5)}}.0, Elemwise{Composite{minimum(minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4), i5)}}.0, Elemwise{Composite{minimum(minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4), i5)}}.0, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, <TensorType(float32, vector)>, <TensorType(float32, matrix)>, <TensorType(float32, vector)>, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, <TensorType(float32, vector)>, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, <TensorType(float32, vector)>, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, <TensorType(float32, vector)>, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, <TensorType(float32, vector)>, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, <TensorType(float32, vector)>, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, <TensorType(float32, vector)>, <TensorType(float32, vector)>, <TensorType(float32, matrix)>, <TensorType(float32, vector)>, InplaceDimShuffle{1,0}.0, InplaceDimShuffle{x,0}.0, InplaceDimShuffle{x,0}.0, CGemv{inplace}.0, InplaceDimShuffle{1,0}.0, InplaceDimShuffle{1,0}.0, InplaceDimShuffle{1,0}.0, InplaceDimShuffle{1,0}.0, InplaceDimShuffle{1,0}.0, InplaceDimShuffle{1,0}.0, InplaceDimShuffle{1,0}.0, InplaceDimShuffle{1,0}.0, InplaceDimShuffle{1,0}.0, InplaceDimShuffle{1,0}.0, InplaceDimShuffle{1,0}.0, InplaceDimShuffle{1,0}.0, InplaceDimShuffle{1,0}.0, InplaceDimShuffle{1,0}.0, InplaceDimShuffle{1,0}.0, InplaceDimShuffle{1,0}.0, Alloc.0, Alloc.0, InplaceDimShuffle{0,x}.0)
Toposort index: 388
Inputs types: [TensorType(int64, scalar), TensorType(bool, matrix), TensorType(bool, vector), TensorType(float32, 3D), TensorType(float32, vector), TensorType(int64, vector), TensorType(int64, vector), TensorType(int64, matrix), TensorType(int64, vector), TensorType(int64, vector), TensorType(int64, vector), TensorType(float32, 3D), TensorType(float32, 3D), TensorType(int32, 3D), TensorType(int32, 3D), TensorType(float32, vector), TensorType(float32, 3D), TensorType(float32, 3D), TensorType(float32, 3D), TensorType(float32, 3D), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, 3D), TensorType(float32, 3D), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(int64, scalar), TensorType(int64, scalar), TensorType(int64, scalar), TensorType(int64, scalar), TensorType(int64, scalar), TensorType(int64, scalar), TensorType(int64, scalar), TensorType(int64, scalar), TensorType(int64, scalar), TensorType(int64, scalar), TensorType(int64, scalar), TensorType(int64, scalar), TensorType(int64, scalar), TensorType(int64, scalar), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, vector), TensorType(float32, matrix), TensorType(float32, vector), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, vector), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, vector), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, vector), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, vector), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, vector), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, vector), TensorType(float32, vector), TensorType(float32, matrix), TensorType(float32, vector), TensorType(float32, matrix), TensorType(float32, row), TensorType(float32, row), TensorType(float32, vector), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, matrix), TensorType(float32, vector), TensorType(float32, col)]
Inputs shapes: [(), (39, 4), (39,), (39, 39, 55), (39,), (39,), (39,), (39, 4), (39,), (39,), (39,), (39, 39, 55), (39, 39, 55), (39, 50, 6), (39, 9, 6), (39,), (40, 39, 55), (40, 39, 55), (40, 50, 6), (40, 9, 6), (2, 55), (2, 55), (2, 55), (2, 55), (2, 55), (2, 55), (2, 55), (2, 55), (2, 300, 55), (2, 55, 55), (2, 55), (2, 55), (2, 55), (2, 2), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (33404, 300), (300, 55), (55,), (55, 55), (55,), (300, 55), (55, 55), (55,), (300, 55), (55, 55), (55,), (300, 55), (55, 55), (55,), (300, 55), (55, 55), (55,), (300, 55), (55, 55), (55,), (300, 55), (55, 55), (300, 55), (55, 55), (55,), (55,), (55, 2), (2,), (55, 55), (1, 55), (1, 55), (55,), (2, 55), (55, 55), (55, 55), (55, 55), (55, 55), (55, 55), (55, 55), (55, 55), (55, 300), (55, 300), (55, 300), (55, 300), (55, 300), (55, 300), (55, 300), (55, 300), (33404, 300), (55,), (55, 1)]
Inputs strides: [(), (4, 1), (1,), (8580, 220, 4), (4,), (-8,), (-8,), (-32, 8), (-8,), (-8,), (-8,), (-8580, 220, 4), (-8580, 220, 4), (-1200, 24, 4), (-216, 24, 4), (4,), (8580, 220, 4), (8580, 220, 4), (1200, 24, 4), (216, 24, 4), (220, 4), (220, 4), (220, 4), (220, 4), (220, 4), (220, 4), (220, 4), (220, 4), (66000, 220, 4), (12100, 220, 4), (220, 4), (220, 4), (220, 4), (8, 4), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (1200, 4), (220, 4), (4,), (220, 4), (4,), (220, 4), (220, 4), (4,), (220, 4), (220, 4), (4,), (220, 4), (220, 4), (4,), (220, 4), (220, 4), (4,), (220, 4), (220, 4), (4,), (220, 4), (220, 4), (220, 4), (220, 4), (4,), (4,), (8, 4), (4,), (4, 220), (220, 4), (220, 4), (4,), (4, 8), (4, 220), (4, 220), (4, 220), (4, 220), (4, 220), (4, 220), (4, 220), (4, 220), (4, 220), (4, 220), (4, 220), (4, 220), (4, 220), (4, 220), (4, 220), (1200, 4), (4,), (4, 4)]
Inputs values: [array(39), 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', array([[0., 0.],
       [0., 0.]], dtype=float32), array(39), array(39), array(39), array(39), array(39), array(39), array(39), array(39), array(39), array(39), array(39), array(39), array(39), array(39), 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', array([-0.03832172, -0.04413849], dtype=float32), 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown', 'not shown']
Outputs clients: [[], [], [], [], [Subtensor{int64}(forall_inplace,cpu,grad_of_scan_fn}.4, ScalarFromTensor.0)], [Subtensor{int64}(forall_inplace,cpu,grad_of_scan_fn}.5, ScalarFromTensor.0)], [Subtensor{int64}(forall_inplace,cpu,grad_of_scan_fn}.6, ScalarFromTensor.0)], [Subtensor{int64}(forall_inplace,cpu,grad_of_scan_fn}.7, ScalarFromTensor.0)], [Subtensor{int64}(forall_inplace,cpu,grad_of_scan_fn}.8, ScalarFromTensor.0)], [Subtensor{int64}(forall_inplace,cpu,grad_of_scan_fn}.9, ScalarFromTensor.0)], [Subtensor{int64}(forall_inplace,cpu,grad_of_scan_fn}.10, ScalarFromTensor.0)], [Subtensor{int64}(forall_inplace,cpu,grad_of_scan_fn}.11, ScalarFromTensor.0)], [Subtensor{int64}(forall_inplace,cpu,grad_of_scan_fn}.12, ScalarFromTensor.0)], [Subtensor{int64}(forall_inplace,cpu,grad_of_scan_fn}.13, ScalarFromTensor.0)], [Subtensor{int64}(forall_inplace,cpu,grad_of_scan_fn}.14, ScalarFromTensor.0)], [Subtensor{int64}(forall_inplace,cpu,grad_of_scan_fn}.15, ScalarFromTensor.0)], [Subtensor{int64}(forall_inplace,cpu,grad_of_scan_fn}.16, ScalarFromTensor.0)], [Subtensor{int64}(forall_inplace,cpu,grad_of_scan_fn}.17, ScalarFromTensor.0)], [InplaceDimShuffle{1,0,2}(forall_inplace,cpu,grad_of_scan_fn}.18), Shape_i{1}(forall_inplace,cpu,grad_of_scan_fn}.18)], [Reshape{2}(forall_inplace,cpu,grad_of_scan_fn}.19, MakeVector{dtype='int64'}.0)], [InplaceDimShuffle{1,0,2}(forall_inplace,cpu,grad_of_scan_fn}.20), Shape_i{1}(forall_inplace,cpu,grad_of_scan_fn}.20)], [Reshape{2}(forall_inplace,cpu,grad_of_scan_fn}.21, MakeVector{dtype='int64'}.0), Shape_i{2}(forall_inplace,cpu,grad_of_scan_fn}.21)], [InplaceDimShuffle{1,0,2}(forall_inplace,cpu,grad_of_scan_fn}.22)], [Reshape{2}(forall_inplace,cpu,grad_of_scan_fn}.23, MakeVector{dtype='int64'}.0), Shape_i{2}(forall_inplace,cpu,grad_of_scan_fn}.23)], [Reshape{2}(forall_inplace,cpu,grad_of_scan_fn}.24, MakeVector{dtype='int64'}.0), Shape_i{2}(forall_inplace,cpu,grad_of_scan_fn}.24)], [Reshape{2}(forall_inplace,cpu,grad_of_scan_fn}.25, MakeVector{dtype='int64'}.0), Shape_i{2}(forall_inplace,cpu,grad_of_scan_fn}.25)], [InplaceDimShuffle{1,0,2}(forall_inplace,cpu,grad_of_scan_fn}.26), Shape_i{1}(forall_inplace,cpu,grad_of_scan_fn}.26)], [Reshape{2}(forall_inplace,cpu,grad_of_scan_fn}.27, MakeVector{dtype='int64'}.0), Shape_i{2}(forall_inplace,cpu,grad_of_scan_fn}.27)], [Reshape{2}(forall_inplace,cpu,grad_of_scan_fn}.28, MakeVector{dtype='int64'}.0), Shape_i{2}(forall_inplace,cpu,grad_of_scan_fn}.28)], [InplaceDimShuffle{1,0,2}(forall_inplace,cpu,grad_of_scan_fn}.29)], [Reshape{2}(forall_inplace,cpu,grad_of_scan_fn}.30, MakeVector{dtype='int64'}.0), Shape_i{1}(forall_inplace,cpu,grad_of_scan_fn}.30)], [Reshape{2}(forall_inplace,cpu,grad_of_scan_fn}.31, MakeVector{dtype='int64'}.0)]]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

In [27]:
train_ind = pickle.load(open("/home/norbert/Doktorat/SyntacticTreesDisambiguation/Model/train_observations","rb"))
validation_ind = pickle.load(open("/home/norbert/Doktorat/SyntacticTreesDisambiguation/Model/validation_observations","rb"))

train_data = data[:10]#[data[i] for i in train_ind]
validation_data = data[10:]#[data[i] for i in validation_ind]

n_train = len(train_data)
n_val = len(validation_data)

n_train, n_val

(10, 9)

In [24]:
def _check_sentence(xml_tree, accept_tags=["forest","tree"]):
    
    """
    Funkcja sprawdza poprawnosc wypowiedzenia i arumentu: 
    - czy istnieje dla niego poprawne drzewo - wypowiedzenie jest poprawne jesli base_answer na polu "type" ma wartosc "FULL".
    - arumentem powinno byc drzewo o tagu korzenia rownym "forest" lub "tree".
    [W oryginalnych plikach z lasami jest to "forest", natomiast gdy z lasu tworzone sa pojedyncze drzewa,
    to maja one tag "tree"]
    
    xml_tree - las drzew lub drzewo [xml.etree.ElementTree.ElementTree]
    """
    
    if type(xml_tree) != ET.ElementTree:
        raise AssertionError("Argument xml_tree is not not ElementTree")
    
    
    if type(accept_tags) == str:
        accept_tags = [accept_tags]
    
    
    if not xml_tree.getroot().tag in accept_tags:
        raise AssertionError('Argument in not in [' + ",".join(accept_tags) + '] - it has tag "' + xml_tree.getroot().tag + '"' )
    
    
    base_answer_type = xml_tree.getroot().find('.//answer-data//base-answer').attrib["type"]
    correct = base_answer_type == "FULL"

    if not correct:
        raise AssertionError("Sentence is not correct: Node <base-answer> has type value " + base_answer_type  + " instead of 'FULL'")
        
    pass


def get_random_tree(forest, random_state=None):
    
    """
    Funkcja zwraca losowe drzewo z upakowanego lasu (forest).
    Dla lasu, w ktorym nie ma poprawnego drzewa funkcja wyrzuca blad.
    
    forest - las drzew [xml.etree.ElementTree.ElementTree]
    """

    # sprawdzenie poprawnosci lasu i ewentualne wypisanie komunikatu
    _check_sentence(forest,"forest")
    
    # ustawiamy ziarno
    if random_state is not None:
        np.random.seed(random_state)
            
            
    root_old = forest.getroot()
    root_new = ET.Element("tree",root_old.attrib)
    
    
    # las sklada sie z drzew (wezly "node") oraz dodatkowych danych (inne wezly) -
    # tresc wypowiedzenia, statystyki lasu, itd. - i tutaj przepisujemy te wezly
    features = root_old.getchildren()
    for feature in features:
        if feature.tag != "node": 
            feature_copy = deepcopy(feature)
            if feature_copy.tag == "stats":
                feature_copy.tag = "forest-stats"
                
            root_new.append(feature_copy) # modyfikujemy tag wezla wiec potrzebna kopia, zeby nie zmodyfikowac oryginalnego drzewa
    
    # definiujemy wezel ze statystykami drzewa
    # robimy to w tym iejscu zeby zachowac logiczna kolejnosc wezlow - zeby wypisywalo sie to na poczatku
    # wartosci nadamy nizej
    ET.SubElement(root_new, "tree-stats", {"height":"0","nodes":"0"})
            
            
    # definiujemy rekurencyjna funkcje, ktora bedzie przechodzic po lesie i
    # kolekcjonowac wezly, tworzac losowe drzewo.
    # drzewo jest tworzone na korzeniu root_new.
    def add_random_children(current_node_old):
        
        current_node_new = ET.SubElement(root_new, current_node_old.tag, current_node_old.attrib)
        
        features = current_node_old.getchildren()
        # kazdy "node" jest terminalem albo nieterminalem i ma opis wlasnosci
        # i tutaj wyciagamy te wlasnosci z wezla innego niz "children"
        for feature in features:
            if feature.tag != "children": 
                current_node_new.append(feature)
        
        children_old = current_node_old.findall("children")
        if len(children_old) == 0: #jestesmy w lisciu wiec konczymy dzialanie funkcji
            return None
        random_children_old = children_old[np.random.choice(len(children_old),1)[0]]
        random_children_new = ET.SubElement(current_node_new, random_children_old.tag, random_children_old.attrib)
        for child_old in random_children_old.getchildren():
            x = ET.SubElement(random_children_new, child_old.tag, child_old.attrib)
            next_node = root_old.find('.//node[@nid="' + x.attrib["nid"] + '"]')
            add_random_children(next_node)
        
    
        # wezel startowy (przyjmujemy, ze node z id=0 jest zawsze pierwszy):
    # TODO: upewnic sie czy to jest poprawne podejscie - czy moze byc inny wezel poczatkowym
    node_0 = root_old.find('.//node[@nid="0"]') 
    
    # konstruujemy drzewo:
    add_random_children(node_0)
    
    new_tree = ET.ElementTree(root_new)
    
    th = _tree_height(new_tree, node_id=0)
    
    root_new.find("tree-stats").attrib["height"] = str(th)
    root_new.find("tree-stats").attrib["nodes"] = str(len(root_new.findall("node")))
    
    return new_tree
       
 

def number_of_trees_in_forest(forest):

    """
    Funkcja zwraca liczbe drzew w lesie forest.
    
    forest - las drzew [xml.etree.ElementTree.ElementTree]
    """
    
    _check_sentence(forest,"forest")
    
    return int(forest.find("stats").attrib["trees"])
    
    
def get_random_negative_tree(forest, random_state=None):
    
    """
    Funkcja zwraca losowe negatywne (niepoprawne) drzewo z lasu forest.
    
    Gdy las sklada sie tylko z jednego drzewa (poprawnego) to zwracana jest wartosc None.
    
    forest - las drzew [xml.etree.ElementTree.ElementTree]
    """
    
    _check_sentence(forest,"forest")
    
    
    number_of_trees = number_of_trees_in_forest(forest)
    
    if number_of_trees == 1:
        Warning("There is only one tree in the forest")
        return None
    
    else:
        while True:
            tree = get_random_tree(forest,random_state)
            if not is_positive(tree):
                return tree
    
    

def get_subtree_label(tree, node):
    
    if node.find("children") is None:
        return 1
    
    if node.find("children").attrib.get("chosen","false") == "false":
        return 0
    else:
        return int(np.all([get_subtree_label(tree, tree.find(".//node[@nid='"+ x.attrib["nid"] + "']")) for x in node.find("children").findall("child")]))
        

In [43]:
def dependency_tree(tree):

    dep_tree, ids = terminals(tree)
    n_terminals = len(dep_tree)
    
    for nid in ids:

        parents = tree.findall(".//children/child[@nid='"+str(nid)+"']....")
        print(parents)
        for parent in parents:
            print(parent.attrib["nid"])
            if parent is not None:
                loc =  np.where([str(nid) in [x[0] for x in branch] and len(branch[-1])>=2 for branch in dep_tree])[0]
                print(ids)
                print(loc)
                if parent.attrib["nid"] not in ids:
                    ids.append(parent.attrib["nid"])


                if len(parent.findall("children/child"))==1:

                    dep_tree[loc[0]].append(tuple([parent.attrib["nid"]] +[x.text for x in parent.find("nonterminal").getchildren()]))

                    if parent.attrib["nid"] == "0":

                        if parent.attrib["nid"] not in [branch[0][0] for branch in dep_tree]:
                            dep_tree.append([tuple([parent.attrib["nid"]] +[x.text for x in parent.find("nonterminal").getchildren()])])


                else:

                    dep_tree[loc[0]].append((parent.attrib["nid"],))

                    if parent.attrib["nid"] not in [branch[0][0] for branch in dep_tree]:
                        dep_tree.append([tuple([parent.attrib["nid"]] +[x.text for x in parent.find("nonterminal").getchildren()])])
            else:
                pass
            #labels.append(get_subtree_label(tree, tree.find(".//node[@nid='" + str(nid) + "']")))
            
    heads = [x[-2][0] if len(x)>1 else x[0][0]  for x in dep_tree]       
    labels = [get_subtree_label(tree, tree.find(".//node[@nid='" + str(nid) + "']")) for nid in heads]

    return(dep_tree, labels, n_terminals)                                           


In [None]:
def transform_to_dependency_format(tree):
    
    dep_tree, labels, n_terminals = dependency_tree(tree)
    
    values = [[x[1] for x in branch[:-1]] for branch in dep_tree]
    
    top_node_ids = [branch[-1][0] for branch in dep_tree]
    
    for i in range(n_terminals,len(values)):
        if len(values[i])>0:
            values[i] = [get_head(tree, top_node_ids[i])]+values[i]
        else:
            values[i] = [get_head(tree, top_node_ids[i]),"__wypowiedzenie__"]


    tokens_and_rules = [(y[0],"-".join(y[1:])) if len(y)>1 else (y[0],"__brak__") for y in values]

    nodes_ids = [[x[0] for x in branch] for branch in dep_tree]
    
    parent_ids = [0]*len(nodes_ids)
    firsts = [x[0] for x in nodes_ids]


    for i in range(len(nodes_ids)):
        last = nodes_ids[i][-1]

        if len(nodes_ids[i])==1 and last == "0":
            parent_ids[i] = 0
        else:
            parent_ids[i] = np.where([last == x for x in firsts])[0][0] + 1 # "+1" po to zeby format danych zgadzal sie z tymi ze stanfordu 
                                                                            # - numerujemy tokeny od 1, a nie od 0

    nodes_used_in_tree = [x[0] for branch in dep_tree for x in branch]    
    
    dependency_data = list(zip([x[0] for x in tokens_and_rules],[x[1] for x in tokens_and_rules], parent_ids, labels))
    
    return(dependency_data, nodes_used_in_tree)

In [120]:
def write_dependency_format(dep_tree, folder, overwrite=False):
    
    if not overwrite:
        mode = "a+"
    else:
        mode = "w"
    
    tokens = [x[0] for x in dep_tree[0]]
    with open(folder+"/tokens.txt", mode) as f:
        f.write(" ".join(tokens) + "\n")
        
    rules = [x[1] for x in dep_tree[0]]
    with open(folder+"/rules.txt", mode) as f:
        f.write(" ".join(rules) + "\n")
        
    parents = [str(x[2]) for x in dep_tree[0]]
    with open(folder+"/parents.txt", mode) as f:
        f.write(" ".join(parents) + "\n")
        
    labels = [str(x[3]) for x in dep_tree[0]]
    with open(folder+"/labels.txt", mode) as f:
        f.write(" ".join(labels) + "\n")

    nodes_used_in_tree = dep_tree[1]
    with open(folder+"/nodes_used_in_tree.txt", mode) as f:
        f.write(" ".join(nodes_used_in_tree) + "\n")

In [121]:
write_dependency_format(dep_tree, "Data")

In [39]:
data_folder = "../Składnica-frazowa-171220/**/*.xml"

In [42]:
import glob

j = 1
trees, labels = [],[]
for filename in glob.iglob(data_folder, recursive=True):
    
    forest = ET.parse(filename)
 
    try:
        num_trees = number_of_trees_in_forest(forest)
        if num_trees<10 and num_trees>5:
            print(filename)
            break
        
        if num_trees<100000:
            
            if j % 100 ==0:
                print(j)
                print(num_trees)
            
            trees.append(get_positive_tree(forest))
            labels.append(1)
        
            
            if num_trees < 10:

                trees.append(get_random_negative_tree(forest))
                labels.append(0)

            elif num_trees<20:
                for i in range(3):
                    trees.append(get_random_negative_tree(forest))
                    labels.append(0)
            elif num_trees<30:
                for i in range(4):
                    trees.append(get_random_negative_tree(forest))
                    labels.append(0)
            elif num_trees<40:
                for i in range(5):
                    trees.append(get_random_negative_tree(forest))
                    labels.append(0)
            elif num_trees<10000:
                for i in range(10):
                    trees.append(get_random_negative_tree(forest))
                    labels.append(0)
               
            j += 1
        
    except:
        pass
    
    

    if j>1000:
        break

../Składnica-frazowa-171220/NKJP_1M_2002000137/morph_3-p/morph_3.36-s.xml


In [9]:
def _tree_height(xml_tree, node_id=0):
    
    """
    Funkcja oblicza wysokosc drzewa (dlugosc najdluzszej sciezki od korzenia do liscia)
    lub lasu (maximum z wszystkich mozliwych drzew)
    
    xml_tree - drzewo luba las drzew lub korzen drzewa jednego lub drugiego
    """
    
    
    if type(xml_tree)==ET.Element:
        node = tree
    else:
        node = tree.getroot()
        
    node = node.find('.//node[@nid="' + str(node_id) + '"]')
    children = node.findall(".//children//child")
    
    if len(children)==0:
        return 1
    else:
        children_nodes = [child.attrib["nid"] for child in children]
        return 1+max([_tree_height(tree,x) for x in children_nodes])
        

In [18]:
def number_of_nodes(tree):
    """
    Zwraca liczbe wezlow w drzewie.
    
    tree - drzewo lub korzen drzewa
    """
    if type(tree)==ET.Element:
        return len(tree.findall("node"))
    else:
        return len(tree.getroot().findall("node")) 

In [23]:
def get_subtree_label(tree, node):
    
    if node.find("children") is None:
        return 1
    
    if node.find("children").attrib.get("chosen","false") == "false":
        return 0
    else:
        return int(np.all([get_subtree_label(tree, tree.find(".//node[@nid='"+ x.attrib["nid"] + "']")) for x in node.find("children").findall("child")]))

In [20]:
def is_positive(tree): 
    
    """
    Funkcja sprawdza czy drzewo jest pozytywne - czy jest poprawnym drzewem rozbioru
    Zwraca wartosc logiczna.
    
    tree - drzewo [xml.etree.ElementTree.ElementTree]
    """
    
    _check_sentence(tree,"tree")
    
    assert len(tree.find("node"))>0, 'There is not "node" element in the tree'
    
    #Sprawdzamy czy wszystkie wezly "node" maja wartosc chosen="true":
    for x in tree.iter("node"):
        if not x.attrib["chosen"]=="true":
            return False

    #Sprawdzamy czy wszystkie wezly "children" maja wartosc chosen="true":
    for x in tree.iter(".//children"):
        if not x.attrib["chosen"]=="true":
            return False
        
    return True
        
        
    


def get_positive_tree(forest):
    
    """
    Funkcja zwraca poprawne (pozytywne) drzewo z upakowanego lasu (forest).
    Dla lasu, w ktorym nie ma poprawnego drzewa funkcja wyrzuca blad.
    
    forest - las drzew [xml.etree.ElementTree.ElementTree]
    """

    # sprawdzenie poprawnosci lasu i ewentualne wypisanie komunikatu
    _check_sentence(forest,"forest")
            
    root_old = forest.getroot()
    root_new = ET.Element("tree",root_old.attrib)
    
    
    # las sklada sie z drzew (wezly "node") oraz dodatkowych danych (inne wezly) -
    # tresc wypowiedzenia, statystyki lasu, itd. - i tutaj przepisujemy te wezly
    features = root_old.getchildren()
    for feature in features:
        if feature.tag != "node": 
            feature_copy = deepcopy(feature)
            if feature_copy.tag == "stats":
                feature_copy.tag = "forest-stats"
                
            root_new.append(feature_copy) # modyfikujemy tag wezla wiec potrzebna kopia, zeby nie zmodyfikowac oryginalnego drzewa
            
    # definiujemy rekurencyjna funkcje, ktora bedzie przechodzic po lesie i
    # kolekcjonowac wezly, tworzac losowe drzewo.
    # drzewo jest tworzone na korzeniu root_new.
    def add_positive_children(current_node_old):
        
        current_node_new = ET.SubElement(root_new, current_node_old.tag, current_node_old.attrib)
        
        features = current_node_old.getchildren()
        # kazdy "node" jest terminalem albo nieterminalem i ma opis wlasnosci
        # i tutaj wyciagamy te wlasnosci z wezla innego niz "children"
        for feature in features:
            if feature.tag != "children": 
                current_node_new.append(feature)
        
        
        children_old = current_node_old.findall('children[@chosen="true"]')
        # powinno byc tylko jedno takie dziecko
        
        assert len(children_old) <= 1, 'More than one children has chosen="true"'
        
        if len(children_old) == 0: #jestesmy w lisciu wiec konczymy dzialanie funkcji
            return None
        
        #random_children_old = children_old[np.random.choice(len(children_old),1)[0]]
        children_new = ET.SubElement(current_node_new, children_old[0].tag, children_old[0].attrib)
        for child_old in children_old[0].getchildren():
            x = ET.SubElement(children_new, child_old.tag, child_old.attrib)
            next_node = root_old.find('.//node[@nid="' + x.attrib["nid"] + '"]')
            assert next_node.attrib["chosen"] == "true"
            add_positive_children(next_node)
        
    
    # wezel startowy (przyjmujemy, ze node z id=0 jest zawsze pierwszy):
    # TODO: upewnic sie czy to jest poprawne podejscie - czy moze byc inny wezel poczatkowym
    node_0 = root_old.find('.//node[@nid="0"][@chosen="true"]') 
    
    # konstruujemy drzewo:
    add_positive_children(node_0)
    
    positive_tree = ET.ElementTree(root_new)

    # Sprawdzenie poprawnosci drzewa
    assert is_positive(positive_tree), """Something gone wrong - tree is not positive"""
        
        
    return positive_tree
    

In [96]:
def get_all_trees(forest):
    
    """
    Funkcja zwraca losowe drzewo z upakowanego lasu (forest).
    Dla lasu, w ktorym nie ma poprawnego drzewa funkcja wyrzuca blad.
    
    forest - las drzew [xml.etree.ElementTree.ElementTree]
    """

    # sprawdzenie poprawnosci lasu i ewentualne wypisanie komunikatu
    _check_sentence(forest,"forest")

            
    root_old = forest.getroot()
    root_new = ET.Element("tree",root_old.attrib)
    
    
    # las sklada sie z drzew (wezly "node") oraz dodatkowych danych (inne wezly) -
    # tresc wypowiedzenia, statystyki lasu, itd. - i tutaj przepisujemy te wezly
    features = root_old.getchildren()
    for feature in features:
        if feature.tag != "node": 
            feature_copy = deepcopy(feature)
            if feature_copy.tag == "stats":
                feature_copy.tag = "forest-stats"
                
            root_new.append(feature_copy) # modyfikujemy tag wezla wiec potrzebna kopia, zeby nie zmodyfikowac oryginalnego drzewa
    
    # definiujemy wezel ze statystykami drzewa
    # robimy to w tym iejscu zeby zachowac logiczna kolejnosc wezlow - zeby wypisywalo sie to na poczatku
    # wartosci nadamy nizej
    ET.SubElement(root_new, "tree-stats", {"height":"0","nodes":"0"})
            
            
    # definiujemy rekurencyjna funkcje, ktora bedzie przechodzic po lesie i
    # kolekcjonowac wezly, tworzac losowe drzewo.
    # drzewo jest tworzone na korzeniu root_new.
    
    trees = []
    
    def add_random_children(current_node_old,root_new):
        
        root_new_recurrent = deepcopy(root_new)
        
        current_node_new = ET.SubElement(root_new_recurrent, current_node_old.tag, current_node_old.attrib)
        
        features = current_node_old.getchildren()
        # kazdy "node" jest terminalem albo nieterminalem i ma opis wlasnosci
        # i tutaj wyciagamy te wlasnosci z wezla innego niz "children"
        for feature in features:
            if feature.tag != "children": 
                current_node_new.append(feature)
        
        children_old = current_node_old.findall("children")
        if len(children_old) == 0: #jestesmy w lisciu wiec konczymy dzialanie funkcji
            return root_new_recurrent
        
        #random_children_old = children_old[np.random.choice(len(children_old),1)[0]]
        #random_children_new = ET.SubElement(current_node_new, random_children_old.tag, random_children_old.attrib)
        
        
        
        for children in children_old:
            
            children_new = ET.SubElement(current_node_new, children.tag, children.attrib)
            
            for child_old in children.getchildren():
                x = ET.SubElement(children_new, child_old.tag, child_old.attrib)
                next_node = root_old.find('.//node[@nid="' + x.attrib["nid"] + '"]')
                if len(next_node.findall("children"))==0:
                    trees.append(ET.ElementTree(root_new_recurrent))
                else:
                    add_random_children(next_node,root_new_recurrent)
                    
        
        
    
        # wezel startowy (przyjmujemy, ze node z id=0 jest zawsze pierwszy):
    # TODO: upewnic sie czy to jest poprawne podejscie - czy moze byc inny wezel poczatkowym
    node_0 = root_old.find('.//node[@nid="0"]') 
    
    # konstruujemy drzewo:
    #add_random_children(node_0,root_new)
    
    #new_tree = ET.ElementTree(root_new)
    
    #th = _tree_height(new_tree, node_id=0)
    
    #root_new.find("tree-stats").attrib["height"] = str(th)
    #root_new.find("tree-stats").attrib["nodes"] = str(len(root_new.findall("node")))
    
    add_random_children(node_0,root_new)
    
    return trees