In [2]:
import nltk
from nltk.corpus import treebank
import regex as re
import csv

In [3]:
def print_tree_nltk_treebank(doc_id):
    nltk.download('treebank')
    for tree in treebank.parsed_sents(doc_id):
        tree.pretty_print()

def print_leaves_nltk_treebank(doc_id):
    nltk.download('treebank')
    for tree in treebank.parsed_sents(doc_id):
        print(tree.leaves())

In [4]:
class Node:
    def __init__(self, parent=None, token=None, label=None):
      self.children = []
      self.parent = parent
      self.token = token
      self.label = label

In [54]:
def generate_sentence_list_rep_from_ptb_mrg_file(file_path, sen_ind):
    with open(file_path) as f:
        lines = f.readlines()
        indices = []
        for i in range(len(lines)):
            if lines[i] == '( (S \n':
                indices.append(i)
        sentences = []
        for i in range(len(indices)):
            if i == len(indices) - 1:
                sentences.append(lines[indices[i]:])
            else:
                sentences.append(lines[indices[i]:indices[i+1]])
    print(sen_ind)
    return sentences[sen_ind]

def strip_sentence_list_rep(sen):
    stripped_sentence = []
    for line in sen:
        line = line.strip()
        line = re.split('([\(\)])', line)
        line = [x for x in line if (x != '' and x != ' ')]
        stripped_sentence.append(line)
    return stripped_sentence

def flatten_sentence_list_rep(sen):
    fsen = [item for sublist in sen for item in sublist]
    fsen = fsen[1:len(fsen) - 1]
    return fsen

def generate_tree_from_flattened_sentence_list_rep(sen):
    tree = Node()
    current_node = tree
    for i in sen:
        if i == '(':
            new_node = Node(parent=current_node)
            current_node.children.append(new_node)
            current_node = new_node
        elif i == ')':
            current_node = current_node.parent
        else:
            i_list = i.strip().split(" ")
            current_node.label = i_list[0]
            if len(i_list) > 1:
                current_node.token = i_list[1]
    return tree

def generate_ptb_parse_tree(file_path, sen_id):
    sentence_list_rep = generate_sentence_list_rep_from_ptb_mrg_file(file_path, sen_id)
    stripped_sentence_rep = strip_sentence_list_rep(sentence_list_rep)
    flat_sentence_rep = flatten_sentence_list_rep(stripped_sentence_rep)
    tree = generate_tree_from_flattened_sentence_list_rep(flat_sentence_rep)
    return tree


def return_token(leaf):
    return leaf.token

def return_node(leaf):
    return leaf

def apply_f_to_leaves_spanned_by_subtree(subtree, f):
    unvisited = []
    returns = []
    children = subtree.children
    for child in reversed(children):
        unvisited.insert(0, child)
    while len(unvisited) != 0:
        node = unvisited.pop(0)
        if node.token is not None:
            returns.append(f(node))
        else:
            children = node.children
            for child in reversed(children):
                unvisited.insert(0, child)
    return returns

In [59]:
def identify_form(char):
    if char == 'i':
        return 'infinitive'
    elif char == 'g':
        return 'gerund'
    elif char == 'p':
        return 'participle'
    elif char == 'v':
        return 'finite'
    else:
        return '-'
    
def identify_tense(char):
    if char == 'f':
        return 'future'
    elif char == 'p':
        return 'past'
    elif char == 'n':
        return 'present'
    else:
        return '-'
    
def identify_aspect(char):
    if char == 'p':
        return 'perfect'
    elif char == 'o':
        return 'progressive'
    elif char == 'b':
        return 'both perfect and progressive'
    else:
        return '-'
    
def identify_person(char):
    if char == '3':
        return '3rd person'
    else:
        return '-'
    
def identify_voice(char):
    if char == 'a':
        return 'active'
    elif char == 'p':
        return 'passive'
    else:
        return '-'

def parse_inflection(inflection):
    keys = ["form","tense","aspect","person","voice"]
    inflection = [char for char in inflection]
    print(inflection)
    inflection_dict = {}
    for i in range(len(keys)):
        inflection_dict[keys[i]] = globals()[f"identify_{keys[i]}"](inflection[i])
    return inflection_dict

def parse_arguments(arguments):
    argument_dict = {}
    for arg in arguments:
        arg = arg.split("-")
        argument_dict['-'.join(arg[1:]).strip()] = arg[0]
    return argument_dict

def parse_row(row):
    row = row.split(' ')
    ann_dict ={}
    ann_dict["wsj_filepath"] = row[0]
    ann_dict["sen_id"] = int(row[1])
    ann_dict["pred_loc"] = int(row[2])
    ann_dict["tagger"] = row[3] 
    ann_dict["frameset"] = row[4]
    ann_dict["inflection"] = parse_inflection(row[5])
    ann_dict["arguments"] = parse_arguments(row[6:])
    return ann_dict

def gen_ptb_file_path(file_path):
    file = re.sub(r'wsj/[0-9]{2}/', '', file_path)
    return "../LDC-Data/NLTK-PTB-Sample/" + file

def type_propbank_ann(ann):
    if re.match('^[0-9]+:[0-9]+$', ann) is not None:
        return 1
    elif re.match('^[0-9]+:[0-9]+\*[0-9]+:[0-9]+$', ann) is not None:
        return 2
    elif re.match('^[0-9]+:[0-9]+,[0-9]+:[0-9]+$', ann) is not None:
        return 3
    else:
        return 4

def get_tokens_for_type_one_ann(index, leaves):
    leaf = int(index.split(":")[0])
    height = int(index.split(":")[1])
    terminal = leaves[int(index.split(':')[0])]
    height = int(index.split(':')[1])
    if height == 0:
        tokens = [terminal.token] 
    else:
        parent_node = terminal
        for i in range(0, height):
            parent_node = parent_node.parent
        tokens = apply_f_to_leaves_spanned_by_subtree(parent_node, return_token)
    return tokens
    
def gen_propbank_labels(row):
    #Annotation Dictionary
    ann_dict = parse_row(row)
    #Tree
    file_path = gen_ptb_file_path(ann_dict["wsj_filepath"])
    tree = generate_ptb_parse_tree(file_path, ann_dict["sen_id"])
    #Tokens and Leaves of Tree
    tokens = apply_f_to_leaves_spanned_by_subtree(tree, return_token)
    leaves = apply_f_to_leaves_spanned_by_subtree(tree, return_node)
    #Labels 
    label_dict = {key:None for key in tokens}
    argument_dict = ann_dict["arguments"]
    for label, indices in argument_dict.items():
        ann_type = type_propbank_ann(indices)
        if ann_type == 1:
            tokens = get_tokens_for_type_one_ann(indices, leaves)
            for tok in tokens:
                label_dict[tok] = label
    print(label_dict)
            
            


with open("../LDC-Data/LDC2004T14/propbank_1/data/prop.txt") as f:
    for line in f:
        print(line)
        gen_propbank_labels(line)
        break
    

wsj/00/wsj_0001.mrg 0 8 gold join.01 vf--a 0:2-ARG0 7:0-ARGM-MOD 8:0-rel 9:1-ARG1 11:1-ARGM-PRD 15:1-ARGM-TMP

['v', 'f', '-', '-', 'a']
0
{'Pierre': 'ARG0', 'Vinken': 'ARG0', ',': 'ARG0', '61': 'ARG0', 'years': 'ARG0', 'old': 'ARG0', 'will': 'ARGM-MOD', 'join': 'rel', 'the': 'ARG1', 'board': 'ARG1', 'as': 'ARGM-PRD', 'a': 'ARGM-PRD', 'nonexecutive': 'ARGM-PRD', 'director': 'ARGM-PRD', 'Nov.': 'ARGM-TMP', '29': 'ARGM-TMP', '.': None}


['( (S \n', '    (NP-SBJ (NNP Mr.) (NNP Vinken) )\n', '    (VP (VBZ is) \n', '      (NP-PRD \n', '        (NP (NN chairman) )\n', '        (PP (IN of) \n', '          (NP \n', '            (NP (NNP Elsevier) (NNP N.V.) )\n', '            (, ,) \n', '            (NP (DT the) (NNP Dutch) (VBG publishing) (NN group) )))))\n', '    (. .) ))\n']
[['(', '(', 'S'], ['(', 'NP-SBJ ', '(', 'NNP Mr.', ')', '(', 'NNP Vinken', ')', ')'], ['(', 'VP ', '(', 'VBZ is', ')'], ['(', 'NP-PRD'], ['(', 'NP ', '(', 'NN chairman', ')', ')'], ['(', 'PP ', '(', 'IN of', ')'], ['(', 'NP'], ['(', 'NP ', '(', 'NNP Elsevier', ')', '(', 'NNP N.V.', ')', ')'], ['(', ', ,', ')'], ['(', 'NP ', '(', 'DT the', ')', '(', 'NNP Dutch', ')', '(', 'VBG publishing', ')', '(', 'NN group', ')', ')', ')', ')', ')', ')'], ['(', '. .', ')', ')', ')']]
['(', 'S', '(', 'NP-SBJ ', '(', 'NNP Mr.', ')', '(', 'NNP Vinken', ')', ')', '(', 'VP ', '(', 'VBZ is', ')', '(', 'NP-PRD', '(', 'NP ', '(', 'NN chairman', ')', ')', '(', 'PP ', '(', 

In [None]:
        '''
        print("Node (by label) Removed")
        print(node.label)
        print("Nodes Remaining (by label) in Unvisited List")
        for i in unvisited:
            print(i.label)
        '''
            #print("My Node is a leaf node")
            '''
            print("Token associated with leaf node")
            print(node.token)
            '''
            #print("My Node is not a leaf node")
            '''
            print("Node's children")
            for child in children:
                print(child.label)
            '''
                '''
                print("Initial Nodes (by label) in Unvisited List")
                for i in unvisited:
                    print(i.label)
                '''
                '''
                print("Nodes (by label) in Unvisited List after Insert")
                for i in unvisited:
                    print(i.label)
                '''