In [1]:
import pandas as pd
import numpy as np
import nltk

import pyconll


In [2]:
# Load the tab-separated proto-role annotation table and drop three
# bookkeeping columns that the rest of the notebook never reads.
roles_ud = pd.read_csv(
    "../../decomp/protoroles_eng_ud1.2_11082016.tsv", delimiter="\t"
    ).drop(
        columns=["Is.Pilot", "Passes.Filters", "Protocol"]
    )
# Keep only the rows whose Dataset value is "bulkfiltered".
roles_ud = roles_ud.loc[roles_ud["Dataset"] == "bulkfiltered"]
# Recode Applicable as an integer flag: "yes" -> 1, every other value -> 0.
roles_ud["Applicable"] = roles_ud["Applicable"].apply(lambda x: 1 if x == "yes" else 0)

roles_ud.head()


Unnamed: 0,Dataset,Split,Annotator.ID,Sentence.ID,Pred.Token,Pred.Lemma,Gram.Func,Arg.Phrase,Arg.Tokens.Begin,Arg.Tokens.End,Property,Response,Applicable,Sent.Grammatical
0,bulkfiltered,train,75,en-ud-train.conllu 5620,3,be,nsubj,the main reason,0,2,awareness,1.0,1,2.0
1,bulkfiltered,train,73,en-ud-train.conllu 4472,1,e-mail,dobj,your assistant,2,3,awareness,4.0,1,5.0
2,bulkfiltered,train,94,en-ud-train.conllu 9843,2,find,nsubj,i,1,1,awareness,5.0,1,5.0
3,bulkfiltered,train,56,en-ud-train.conllu 4484,4,exist,nsubj,other &quot; guidelines &quot;,0,3,awareness,1.0,0,5.0
4,bulkfiltered,train,75,en-ud-train.conllu 4484,4,exist,nsubj,other &quot; guidelines &quot;,0,3,awareness,1.0,1,5.0


In [3]:
from difflib import SequenceMatcher

def get_ud_sentences(ud):
    """Return one space-joined surface string per sentence in ``ud``.

    Parameters
    ----------
    ud : iterable
        Sentence objects exposing a ``conll()`` method that returns the
        sentence serialised as CoNLL-U text (one token per line, columns
        separated by tabs).

    Returns
    -------
    list of str
        For every sentence, the second column (the word form) of each line
        whose first column is a plain integer ID, joined with single spaces.
    """
    sentences = []
    for sample in ud:
        words = []
        for line in sample.conll().split("\n"):
            # Blank lines (e.g. produced by a trailing newline) carry no token;
            # indexing into them used to raise IndexError.
            if not line or line.startswith("#"):
                continue
            columns = line.split("\t")
            # in ud v2, lines include meta-data and non-word tokens
            # (IDs such as "1-2" or "2.1" are not purely numeric)
            if not columns[0].isnumeric():
                continue
            words.append(columns[1])
        sentences.append(" ".join(words))
    return sentences


In [4]:
def close(a, b, threshold=0.75):
    """Tell whether two sequences are similar enough to count as a match.

    Parameters
    ----------
    a, b : sequence
        The two sequences (strings in this notebook) to compare.
    threshold : float, optional
        The ``difflib.SequenceMatcher`` ratio that must be strictly exceeded.
        Defaults to 0.75, the cutoff that used to be hard-coded.

    Returns
    -------
    bool
        True when the similarity ratio is greater than ``threshold``.
    """
    return SequenceMatcher(None, a, b).ratio() > threshold

def get_close(s1, s2):
    """Greedily align two ordered lists of sentences.

    Both lists are walked in parallel. When the current pair is ``close`` the
    pair is recorded and both cursors advance. Otherwise a one-item look-ahead
    decides which side holds an extra entry, and that entry is skipped.

    Parameters
    ----------
    s1, s2 : list of str
        The sentence lists to align, in corpus order.

    Returns
    -------
    tuple of (dict, dict)
        ``s1_to_2`` maps an index in ``s1`` to the matched index in ``s2``;
        ``words_map`` maps the matched ``s1`` string to its ``s2`` string.

    Raises
    ------
    RuntimeError
        When neither the current pair nor either look-ahead matches. The
        exception type is the one the previous bare ``raise`` produced, now
        with a message that names the positions.
    """
    i = 0
    j = 0
    s1_to_2 = {}
    words_map = {}
    while i < len(s1) and j < len(s2):
        if close(s1[i], s2[j]):
            s1_to_2[i] = j
            words_map[s1[i]] = s2[j]
            i += 1
            j += 1
        # The look-aheads are bounds-checked: on the last element of either
        # list they used to fail with an IndexError.
        elif i + 1 < len(s1) and close(s1[i + 1], s2[j]):
            # discard i
            i += 1
        elif j + 1 < len(s2) and close(s1[i], s2[j + 1]):
            # discard j
            j += 1
        else:
            raise RuntimeError(
                f"could not align s1[{i}] with s2[{j}]: "
                "no match within a one-sentence look-ahead"
            )

    return s1_to_2, words_map

def align_ud_versions(ud1, ud2):
    """Extract the sentences of two corpora and align the first to the second.

    Returns a pair: the sentence strings of ``ud1`` and the result of
    ``get_close`` applied to both sentence lists.
    """
    sentences_v1, sentences_v2 = get_ud_sentences(ud1), get_ud_sentences(ud2)
    alignment = get_close(sentences_v1, sentences_v2)
    return sentences_v1, alignment


In [None]:
from spacy.training.converters import conllu_to_docs

def convert_to_spacy(conll_file):
    """Read a CoNLL-U file and return its sentences as spaCy spans.

    Parameters
    ----------
    conll_file : str
        Path of the UTF-8 encoded CoNLL-U file.

    Returns
    -------
    list
        The items of ``doc.sents`` for every doc yielded by
        ``conllu_to_docs``, concatenated in order.
    """
    # The context manager closes the handle; the file used to stay open.
    with open(conll_file, encoding="utf-8") as data:
        annotations = data.read()

    sents = []
    for doc in conllu_to_docs(annotations, no_print=True):
        sents.extend(doc.sents)
    return sents

# Base directory that get_tree() prefixes to the per-genre .xml.tree paths.
path = "../datasets/eng_web_tbk"
sets = ["train.conllu", "test.conllu", "dev.conllu"]
# Lookup tables filled by the loop below; each is keyed by an entry of `sets`.
# conlls_2 / conlls_1 hold the pyconll corpora for UD v2 and UD v1.2.
conlls_2 = {}
conlls_1 = {}
# alignments[s] is the (s1_to_2, words_map) pair returned by get_close.
alignments = {}
# s1_sentences[s] holds the UD v1.2 sentence strings.
s1_sentences = {}
# spacy_parses[s] holds the sentence spans produced from the UD v1.2 file.
spacy_parses = {}

for s in sets:
    # UD version 1.2
    c1_path = f'../datasets/UD_English/en-ud-{s}'
    # UD version 2
    c2_path = f'UD_English-EWT/en_ewt-ud-{s}'

    ud1 = pyconll.load_from_file(c1_path)
    ud2 = pyconll.load_from_file(c2_path)

    conlls_1[s] = ud1
    conlls_2[s] = ud2
    s1_sentences[s], alignments[s] = align_ud_versions(ud1, ud2)
    spacy_parses[s] = convert_to_spacy(c1_path)

    # get_sentence_parse indexes both structures with the same sentence
    # number, so they must contain the same number of sentences.
    assert len(s1_sentences[s]) == len(spacy_parses[s])


In [None]:
def remove_traces(tree):
    """Return a collapsed deep copy of ``tree`` without its ``-NONE-`` leaves.

    The input tree is copied first and is not modified. On the copy, unary
    chains are collapsed (POS nodes and the root included), then every
    terminal whose POS label contains ``-NONE-`` is removed together with its
    POS node.

    If ``collapse_unary`` raises, the original tree is printed and the
    exception is re-raised unchanged.
    """
    new_tree = tree.copy(deep=True) 
    try:
        new_tree.collapse_unary(collapsePOS=True, collapseRoot=True)
    except:
        # already collapsed
        print(tree)
        raise
    # i is the index of the current leaf in the tree as it looks *now*, i.e.
    # after the removals performed so far.
    i = 0
    # pos() is evaluated once here, so the (word, label) pairs being iterated
    # still describe the tree as it was before any removal.
    for _, pos in new_tree.pos():
        if "-NONE-" in pos:
            # tp is the path from the root to leaf i. Its last element indexes
            # the leaf inside its POS node, so tp[:-2] addresses the POS node's
            # parent and tp[-2] is the POS node's position in that parent.
            tp = new_tree.leaf_treeposition(i)
            pos_parent_tp = tp[:-2]
            pos_parent_idx = tp[-2]
            new_tree[pos_parent_tp].pop(pos_parent_idx)
        else:
            # only increment position if leaf hasn't been removed
            i += 1
    return new_tree


def get_tree(conll_id):
    """Load the constituency tree for one sentence and strip its traces.

    ``conll_id`` has the form ``<genre>-<file>-<sentence number>``. The
    sentence number is 1-based and selects a line of
    ``<path>/data/<genre>/penntree/<file>.xml.tree``.

    Returns the tree produced by ``remove_traces``.
    """
    genre, file, sentence = conll_id.split("-")
    with open(path + f"/data/{genre}/penntree/{file}.xml.tree") as f:
        doc = f.readlines()

    try:
        # drop the outermost bracket pair (and the line ending) before parsing
        tree = nltk.tree.ParentedTree.fromstring(doc[int(sentence) - 1][1:-2])
    except ValueError:
        # contains two sentences, add "SP" to indicate sentence pair
        line = doc[int(sentence) - 1]
        tree = nltk.tree.ParentedTree.fromstring("(SP" + line[1:])
        print(" ".join(tree.leaves()))
    return remove_traces(tree)


def get_sentence_parse(row, decomp_id=None, arg=None):
    """Fetch the constituency tree, text and spaCy parse of one sentence.

    Parameters
    ----------
    row : pandas.Series
        A row of the pivoted annotation table. Only read when ``decomp_id``
        is None, in which case both the sentence id and the argument phrase
        are taken from it.
    decomp_id : str, optional
        Sentence id of the form ``"<file name> <1-based sentence number>"``,
        for example ``"en-ud-train.conllu 5620"``.
    arg : str, optional
        Argument phrase used for a sanity check against the sentence text.
        When it is None the check is skipped.

    Returns
    -------
    tuple
        ``(tree, sent, spacy_parse)``: the trace-free constituency tree, the
        UD v1.2 sentence string and the spaCy sentence span.
    """
    if decomp_id is None:
        decomp_id = row["Sentence.ID"].tolist()[0]
        arg = row["Arg.Phrase"].tolist()[0].lower()

    s, c_id = decomp_id.split(" ")
    c_id = int(c_id) - 1
    s = s.split("-")[-1]
    sent = s1_sentences[s][c_id]

    # get the ud2 index from the ud1 index
    mapped_id = alignments[s][0][c_id]
    c_sent = conlls_2[s][mapped_id]

    # A caller that supplies decomp_id alone has no phrase to verify; calling
    # .replace on None used to raise AttributeError here.
    if arg is not None:
        arg = arg.replace("&#39;s", "'s").replace("&quot;", '"').replace("&#39;", "'")
        if arg not in sent.lower():
            print(f"Warning: arg ({arg}) not in sentence ({sent.lower()}) for id {decomp_id}")

    tree = get_tree(c_sent.id)
    spacy_parse = spacy_parses[s][c_id]
    return tree, sent, spacy_parse


In [7]:
# Pivot to one row per (sentence, predicate, argument, annotator) with one
# column per property underneath "Response" and "Applicable".
roles_wide = roles_ud.pivot_table(
    index=[
        "Sentence.ID",
        "Pred.Lemma",
        "Arg.Phrase",
        "Pred.Token",
        "Arg.Tokens.Begin",
        "Arg.Tokens.End",
        "Gram.Func",
        "Split",
        "Sent.Grammatical",
        "Annotator.ID",
    ],
    columns="Property",
    values=["Response", "Applicable"],
    # The string alias selects the same group-wise mean as np.mean did, but
    # without the deprecation warning recent pandas versions emit for the
    # NumPy callable.
    aggfunc="mean",
).reset_index()

# Attach the constituency tree, sentence text and spaCy parse to every row.
roles_wide[["tree", "sentence", "spacy_parse"]] = roles_wide.apply(
    get_sentence_parse, axis=1, result_type='expand')

roles_wide.head()


  roles_wide = roles_ud.pivot_table(index=["Sentence.ID",  "Pred.Lemma", "Arg.Phrase", "Pred.Token", "Arg.Tokens.Begin", "Arg.Tokens.End", "Gram.Func", "Split", "Sent.Grammatical", "Annotator.ID"], columns="Property", values=["Response", "Applicable"], aggfunc=np.mean).reset_index()


  (NP-SBJ (-NONE- *PRO*))
  (VP (TO to) (VP (VB be) (ADJP-PRD (JJ specific))))))
  (NP-SBJ (-NONE- *PRO*))
  (VP (TO to) (VP (VB be) (ADJP-PRD (JJ specific))))))
  (NP-SBJ (-NONE- *PRO*))
  (VP (TO to) (VP (VB be) (ADJP-PRD (JJ specific))))))
  (NP-SBJ (-NONE- *PRO*))
  (VP (TO to) (VP (VB be) (ADJP-PRD (JJ specific))))))
  (NP-SBJ (-NONE- *PRO*))
  (VP (TO to) (VP (VB be) (ADJP-PRD (JJ specific))))))
  (NP-SBJ (-NONE- *PRO*))
  (VP (TO to) (VP (VB be) (ADJP-PRD (JJ specific))))))
  (NP-SBJ (-NONE- *PRO*))
  (VP (TO to) (VP (VB be) (ADJP-PRD (JJ specific))))))
  (NP-SBJ (-NONE- *PRO*))
  (VP (TO to) (VP (VB be) (ADJP-PRD (JJ specific))))))
  (NP-SBJ (-NONE- *PRO*))
  (VP (TO to) (VP (VB be) (ADJP-PRD (JJ specific))))))
  (NP-SBJ (-NONE- *PRO*))
  (VP (TO to) (VP (VB be) (ADJP-PRD (JJ specific))))))
  (NP-SBJ (-NONE- *PRO*))
  (VP (TO to) (VP (VB be) (ADJP-PRD (JJ specific))))))
  (NP-SBJ (-NONE- *PRO*))
  (VP (TO to) (VP (VB be) (ADJP-PRD (JJ specific))))))
  (NP-SBJ (-NONE- *PRO*))
  

Unnamed: 0_level_0,Sentence.ID,Pred.Lemma,Arg.Phrase,Pred.Token,Arg.Tokens.Begin,Arg.Tokens.End,Gram.Func,Split,Sent.Grammatical,Annotator.ID,...,Response,Response,Response,Response,Response,Response,Response,tree,sentence,spacy_parse
Property,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,...,existed_during,instigation,partitive,sentient,volition,was_for_benefit,was_used,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,en-ud-dev.conllu 1,come,this story,3,4,5,nsubj,dev,5.0,7,...,4.0,2.0,2.0,2.0,1.0,2.0,1.0,"[[[From], [(DT the), (NNP AP)]], [[comes]], [[...",From the AP comes this story :,"(From, the, AP, comes, this, story, :)"
1,en-ud-dev.conllu 1,come,this story,3,4,5,nsubj,dev,5.0,20,...,5.0,5.0,2.0,1.0,3.0,3.0,5.0,"[[[From], [(DT the), (NNP AP)]], [[comes]], [[...",From the AP comes this story :,"(From, the, AP, comes, this, story, :)"
2,en-ud-dev.conllu 100,cry,america,1,0,0,nsubj,dev,2.0,46,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,"[[[America], [(VBD cried), (NP+NN wolf), (PP-L...","America cried wolf in Iraq , and what 's scary...","(America, cried, wolf, in, Iraq, ,, and, what,..."
3,en-ud-dev.conllu 100,cry,america,1,0,0,nsubj,dev,5.0,15,...,5.0,5.0,5.0,5.0,5.0,3.0,5.0,"[[[America], [(VBD cried), (NP+NN wolf), (PP-L...","America cried wolf in Iraq , and what 's scary...","(America, cried, wolf, in, Iraq, ,, and, what,..."
4,en-ud-dev.conllu 100,cry,wolf,1,2,2,dobj,dev,5.0,26,...,3.0,3.0,3.0,1.0,3.0,3.0,3.0,"[[[America], [(VBD cried), (NP+NN wolf), (PP-L...","America cried wolf in Iraq , and what 's scary...","(America, cried, wolf, in, Iraq, ,, and, what,..."


In [8]:
def reduce_traces(terminals, num_tokens):
    """Count how many terminals must be consumed to cover ``num_tokens`` words.

    Parameters
    ----------
    terminals : sequence of str
        POS labels of the leaves, in order. A label that contains ``-NONE-``
        marks a trace and does not count as a word. This is the same test
        ``remove_traces`` applies, so labels merged by unary collapsing
        (for example ``NP-SBJ+-NONE-``) are recognised too.
    num_tokens : int
        Number of non-trace tokens to cover.

    Returns
    -------
    int
        The position just after the ``num_tokens``-th non-trace terminal
        (0 when ``num_tokens`` is 0).

    Raises
    ------
    IndexError
        When ``terminals`` holds fewer than ``num_tokens`` non-trace entries.
    """
    if num_tokens == 0:
        return 0

    seen = 0
    for position, label in enumerate(terminals, start=1):
        if "-NONE-" not in label:
            seen += 1
            if seen == num_tokens:
                return position

    raise IndexError(
        f"requested {num_tokens} non-trace tokens but only {seen} are available"
    )

def get_arg(row):
    """Locate the constituent that spans the annotated argument of ``row``.

    Returns ``(subtree, treepos)`` for the smallest subtree covering the
    argument tokens. Returns ``np.nan`` when the predicate token lies inside
    the argument span, when the covering subtree has a different number of
    leaves than the span (unless its last leaf contains ``*``), or when the
    subtree is labelled ``VP``. A warning is printed when the tokens found in
    the tree differ from the annotated phrase.
    """
    def first(column):
        # each column of the row is itself a one-element Series
        return row[column].tolist()[0]

    tree = first("tree")
    pred_token = int(first("Pred.Token"))
    start = int(first("Arg.Tokens.Begin"))
    end = int(first("Arg.Tokens.End"))
    arg = first("Arg.Phrase").lower()
    decomp_id = first("Sentence.ID")

    # predicate is inside of the argument, so drop
    if start <= pred_token <= end:
        return np.nan

    # translate the token offsets into leaf offsets that skip traces
    leaves, terminals = zip(*tree.pos())
    inc_start = reduce_traces(terminals, start + 1)
    inc_end = reduce_traces(terminals, end + 1)
    span_leaves = leaves[inc_start - 1: inc_end]

    indexed_arg = " ".join(leaf for leaf in span_leaves if "*" not in leaf)
    indexed_arg = indexed_arg.replace("-LRB-", "(").replace("-RRB-", ")").lower()
    arg = arg.replace("&#39;s", "'s").replace("&quot;", '"').replace("&#39;", "'")
    if indexed_arg != arg:
        print(f"Warning: arg ({arg}) != indexed_arg ({indexed_arg}) in id ({decomp_id})")

    treepos = tree.treeposition_spanning_leaves(inc_start - 1, inc_end)
    subtree = tree[treepos]
    if isinstance(subtree, str):
        # leaf, so return immediate parent
        treepos = treepos[:-1]
        subtree = tree[treepos]

    # argument is not a syntactic constituent, so drop
    span_length = inc_end - inc_start + 1
    size_mismatch = (len(subtree.leaves()) != span_length
                     and "*" not in subtree.leaves()[-1])
    if size_mismatch or subtree.label() == "VP":
        print()
        print(inc_end - inc_start)
        print(subtree.leaves())
        print(span_leaves)
        print(arg)
        return np.nan

    return subtree, treepos
    
# Try the extraction on the first row; the cell displays the returned value.
get_arg(roles_wide.iloc[0])


(ParentedTree('NP-SBJ', [ParentedTree('DT', ['this']), ParentedTree('NN', ['story'])]),
 (2,))

In [9]:
# get_arg yields a (subtree, treepos) tuple, or NaN for rows to be discarded.
roles_wide["Arg.Tree_treepos"] = roles_wide.apply(get_arg, axis=1)
print(len(roles_wide))
# NOTE(review): dropna() is called without `subset`, so a NaN in any column
# removes the row, not only a NaN returned by get_arg — confirm this is
# intended.
roles_wide = roles_wide.dropna().reset_index(drop=True)
print(len(roles_wide))

# Split the tuple into two columns, then discard the temporary column.
roles_wide["Arg.Tree"] = roles_wide["Arg.Tree_treepos"].apply(lambda x: x[0])
roles_wide["Arg.treepos"] = roles_wide["Arg.Tree_treepos"].apply(lambda x: x[1])
roles_wide.drop(columns=["Arg.Tree_treepos"], inplace=True)

roles_wide.head()



9
['They', 'have', 'fresh', 'flowers', ',', 'lasted', 'a', 'long', 'while', 'in', 'the', 'vase', ',', 'and', 'the', 'two', 'ladies', 'at', 'the', 'shop', 'know', 'the', 'business', 'well', '.']
('fresh', 'flowers', ',', 'lasted', 'a', 'long', 'while', 'in', 'the', 'vase')
fresh flowers , lasted a long while in the vase

9
['They', 'have', 'fresh', 'flowers', ',', 'lasted', 'a', 'long', 'while', 'in', 'the', 'vase', ',', 'and', 'the', 'two', 'ladies', 'at', 'the', 'shop', 'know', 'the', 'business', 'well', '.']
('fresh', 'flowers', ',', 'lasted', 'a', 'long', 'while', 'in', 'the', 'vase')
fresh flowers , lasted a long while in the vase

29
['received', 'an', 'extremely', 'uplifting', 'Christmas', 'present', 'this', 'year', ':', 'an', 'airplane', 'ride', 'just', 'for', 'him', ',', 'courtesy', 'of', 'the', 'International', 'Fund', 'for', 'Animal', 'Welfare', 'and', 'their', 'friends', "'", 'The', 'Bateleurs', "'"]
('an', 'extremely', 'uplifting', 'Christmas', 'present', 'this', 'year', '

  roles_wide.drop(columns=["Arg.Tree_treepos"], inplace=True)


Unnamed: 0_level_0,Sentence.ID,Pred.Lemma,Arg.Phrase,Pred.Token,Arg.Tokens.Begin,Arg.Tokens.End,Gram.Func,Split,Sent.Grammatical,Annotator.ID,...,Response,Response,Response,Response,Response,tree,sentence,spacy_parse,Arg.Tree,Arg.treepos
Property,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,...,partitive,sentient,volition,was_for_benefit,was_used,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,en-ud-dev.conllu 1,come,this story,3,4,5,nsubj,dev,5.0,7,...,2.0,2.0,1.0,2.0,1.0,"[[[From], [(DT the), (NNP AP)]], [[comes]], [[...",From the AP comes this story :,"(From, the, AP, comes, this, story, :)","[[this], [story]]","(2,)"
1,en-ud-dev.conllu 1,come,this story,3,4,5,nsubj,dev,5.0,20,...,2.0,1.0,3.0,3.0,5.0,"[[[From], [(DT the), (NNP AP)]], [[comes]], [[...",From the AP comes this story :,"(From, the, AP, comes, this, story, :)","[[this], [story]]","(2,)"
2,en-ud-dev.conllu 100,cry,america,1,0,0,nsubj,dev,2.0,46,...,5.0,5.0,5.0,5.0,5.0,"[[[America], [(VBD cried), (NP+NN wolf), (PP-L...","America cried wolf in Iraq , and what 's scary...","(America, cried, wolf, in, Iraq, ,, and, what,...",[America],"(0, 0)"
3,en-ud-dev.conllu 100,cry,america,1,0,0,nsubj,dev,5.0,15,...,5.0,5.0,5.0,3.0,5.0,"[[[America], [(VBD cried), (NP+NN wolf), (PP-L...","America cried wolf in Iraq , and what 's scary...","(America, cried, wolf, in, Iraq, ,, and, what,...",[America],"(0, 0)"
4,en-ud-dev.conllu 100,cry,wolf,1,2,2,dobj,dev,5.0,26,...,3.0,1.0,3.0,3.0,3.0,"[[[America], [(VBD cried), (NP+NN wolf), (PP-L...","America cried wolf in Iraq , and what 's scary...","(America, cried, wolf, in, Iraq, ,, and, what,...",[wolf],"(0, 1, 1)"


In [10]:
class Node:
    """A word together with ordered lists of left and right dependents."""

    def __init__(self, name):
        self.name = name
        self.left, self.right = [], []

    def to_string(self):
        """Render the subtree in surface order, separated by single spaces."""
        pieces = [child.to_string() for child in self.left]
        pieces.append(self.name)
        pieces.extend(child.to_string() for child in self.right)
        return " ".join(pieces)
    

def filter_children(root):
    """Build a pruned ``Node`` tree from a dependency token.

    Children are visited left side first, then right side, and land on the
    same side of the new node. A ``det`` child is kept as is, a possessive
    child (``poss`` / ``nmod:poss``) is replaced by the word "the", and a
    child with one of the listed relations or a VERB/AUX part of speech is
    kept and pruned recursively. Every other child is dropped.
    """
    new_arg = Node(root.text)
    recursive_deps = ["cc", "compound", "conj", "nummod",
                      "xcomp", "nsubj", "dobj", "mark"]

    # pair every child with the list it would be appended to
    placements = (
        [(child, new_arg.left) for child in root.lefts]
        + [(child, new_arg.right) for child in root.rights]
    )

    for child, bucket in placements:
        relation = child.dep_
        if relation == "det":
            # keep dets
            bucket.append(Node(child.text))
        elif relation in ("poss", "nmod:poss"):
            # change poss to det
            bucket.append(Node("the"))
        elif relation in recursive_deps or child.pos_ in ["VERB", "AUX"]:
            # keep compounds, verbs, det
            bucket.append(filter_children(child))

    return new_arg


In [11]:
def get_stripped(row, remove_all=False):
    """Extract the predicate and a simplified version of the argument.

    Parameters
    ----------
    row : pandas.Series
        A row of the annotation table carrying ``spacy_parse``, ``tree``,
        ``Arg.treepos``, ``Arg.Tokens.Begin``, ``Arg.Tokens.End`` and
        ``Pred.Token``.
    remove_all : bool, optional
        Reserved for a mode that is not implemented.

    Returns
    -------
    tuple
        ``(predicate, arg, stripped_arg, dep_span)``: the predicate token
        text, the argument span text, the text rebuilt by
        ``filter_children`` from the span root, and the argument span.

    Raises
    ------
    NotImplementedError
        When ``remove_all`` is true.
    """
    if remove_all:
        # NotImplemented is the rich-comparison sentinel, not an exception;
        # raising it failed with "TypeError: exceptions must derive from
        # BaseException".
        raise NotImplementedError("remove_all=True is not supported")

    dep_parse = row["spacy_parse"].tolist()[0]
    # The tree is only read below, so no defensive deep copy is needed.
    tree = row["tree"].tolist()[0]
    arg_treepos = row["Arg.treepos"].tolist()[0]
    subtree = tree[arg_treepos]

    start = row["Arg.Tokens.Begin"].tolist()[0]
    end = row["Arg.Tokens.End"].tolist()[0]

    pred_idx = row["Pred.Token"].tolist()[0]
    predicate = dep_parse[pred_idx].text

    dep_span = dep_parse[start:end + 1]
    arg = dep_span.text
    stripped_arg = filter_children(dep_span.root).to_string()

    # Report arguments that are neither an NP themselves nor directly below
    # a node whose label contains "NP".
    looks_nominal = (
        isinstance(subtree, str)
        or "NP" in subtree.label().split("-")[0]
        or (subtree.parent() and "NP" in subtree.parent().label())
    )
    if not looks_nominal:
        print()
        print(subtree.label())
        print(" ".join(subtree.leaves()))
        print(stripped_arg)

    return predicate, arg, stripped_arg, dep_span


In [12]:
# Store the four values returned by get_stripped. "Arg.Phrase" already exists
# in the table and is overwritten here with the argument span text.
roles_wide[
    ["Predicate", "Arg.Phrase", "Arg.Stripped", "Arg.Parse"]
        ] =  roles_wide.apply(get_stripped, axis=1, result_type='expand')


roles_wide.head()



SBAR-NOM-SBJ
What you say
What you say

SBAR-NOM-SBJ
What you say
What you say

SBAR-NOM-SBJ
What they wonder
What they wonder

SBAR-NOM-SBJ
What I love most about this place , other than the food
What I love

SBAR-NOM-SBJ
What I love most about this place , other than the food
What I love

S
us feel secure in our decision to buy when we did
us feel secure

S
us feel secure in our decision to buy when we did
us feel secure

SBAR
what I wanted and that I previously went to Braman Honda
what I wanted and that I went

SBAR
what I wanted and that I previously went to Braman Honda
what I wanted and that I went

SBAR-NOM-SBJ
What I like most about Dr. Liau
What I like

SBAR-NOM-SBJ
What I like most about Dr. Liau
What I like

ADJP-PRD+JJR
bigger
bigger

ADJP-PRD+JJR
bigger
bigger

SBAR-NOM
WHAT YOU PAY FOR
WHAT YOU PAY

SBAR-NOM
WHAT YOU PAY FOR
WHAT YOU PAY

UCP
at least this much , in this crisis , to cease militarily unnecessary provocations and establish genuine peace
much to cease prov

Unnamed: 0_level_0,Sentence.ID,Pred.Lemma,Arg.Phrase,Pred.Token,Arg.Tokens.Begin,Arg.Tokens.End,Gram.Func,Split,Sent.Grammatical,Annotator.ID,...,Response,Response,tree,sentence,spacy_parse,Arg.Tree,Arg.treepos,Predicate,Arg.Stripped,Arg.Parse
Property,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,...,was_for_benefit,was_used,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,en-ud-dev.conllu 1,come,this story,3,4,5,nsubj,dev,5.0,7,...,2.0,1.0,"[[[From], [(DT the), (NNP AP)]], [[comes]], [[...",From the AP comes this story :,"(From, the, AP, comes, this, story, :)","[[this], [story]]","(2,)",comes,this story,"(this, story)"
1,en-ud-dev.conllu 1,come,this story,3,4,5,nsubj,dev,5.0,20,...,3.0,5.0,"[[[From], [(DT the), (NNP AP)]], [[comes]], [[...",From the AP comes this story :,"(From, the, AP, comes, this, story, :)","[[this], [story]]","(2,)",comes,this story,"(this, story)"
2,en-ud-dev.conllu 100,cry,America,1,0,0,nsubj,dev,2.0,46,...,5.0,5.0,"[[[America], [(VBD cried), (NP+NN wolf), (PP-L...","America cried wolf in Iraq , and what 's scary...","(America, cried, wolf, in, Iraq, ,, and, what,...",[America],"(0, 0)",cried,America,(America)
3,en-ud-dev.conllu 100,cry,America,1,0,0,nsubj,dev,5.0,15,...,3.0,5.0,"[[[America], [(VBD cried), (NP+NN wolf), (PP-L...","America cried wolf in Iraq , and what 's scary...","(America, cried, wolf, in, Iraq, ,, and, what,...",[America],"(0, 0)",cried,America,(America)
4,en-ud-dev.conllu 100,cry,wolf,1,2,2,dobj,dev,5.0,26,...,3.0,3.0,"[[[America], [(VBD cried), (NP+NN wolf), (PP-L...","America cried wolf in Iraq , and what 's scary...","(America, cried, wolf, in, Iraq, ,, and, what,...",[wolf],"(0, 1, 1)",cried,wolf,(wolf)


In [13]:
def save_spacy_parse(parse):
    """Turn ``parse`` into a standalone doc and return its JSON form."""
    standalone_doc = parse.as_doc()
    return standalone_doc.to_json()

# Replace the parse objects in both columns by the value save_spacy_parse
# returns. This cell is not re-runnable: a second run would call .as_doc()
# on the already converted values.
roles_wide["spacy_parse"] = roles_wide["spacy_parse"].apply(save_spacy_parse)
roles_wide["Arg.Parse"] = roles_wide["Arg.Parse"].apply(save_spacy_parse)
# Write everything except "Arg.Stripped"; roles_wide itself keeps that column.
roles_wide.drop(columns=["Arg.Stripped"]).to_csv("decomp_ud_trees.csv")


  roles_wide.drop(columns=["Arg.Stripped"]).to_csv("decomp_ud_trees.csv")


In [None]:
import conllu

def read_conllu(conll_file, propbank=True):
    """Parse a CoNLL-U file with the ``conllu`` package.

    Parameters
    ----------
    conll_file : str
        Path of the UTF-8 encoded file.
    propbank : bool, optional
        When true (the default), two extra column names, ``frame`` and
        ``role``, are appended to the ten standard ones.

    Returns
    -------
    The value returned by ``conllu.parse`` for the file contents.
    """
    fields = ['id', 'form', 'lemma', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc']
    if propbank:
        fields += ['frame', 'role']

    # The context manager closes the handle; the file used to stay open.
    with open(conll_file, encoding="utf-8") as data:
        text = data.read()

    return conllu.parse(text, fields=fields)

# Parsed corpora keyed by the entries of `sets`: conlls_propbank is read with
# the extra frame/role columns, conlls with the ten standard columns only.
conlls_propbank = {}
conlls = {}

# Same three file-name suffixes that the alignment cell assigns to `sets`.
sets = ["train.conllu", "test.conllu", "dev.conllu"]
for s in sets:
    conlls_propbank[s] = read_conllu("UP_English-EWT/en_ewt-up-" + s)
    conlls[s] = read_conllu("../datasets/UD_English/en-ud-" + s, propbank=False)


In [15]:
def get_roleset(sentence_id, pred_token, lemma):
    """Look up the ``frame`` value of a predicate token.

    ``sentence_id`` is ``"<file name> <1-based sentence number>"``; the part
    of the file name after the last ``-`` selects the corpus in
    ``conlls_propbank``. A line is printed when the token's lemma differs
    from ``lemma``. Returns the token's ``frame`` value, or ``"N/A"`` (after
    printing a warning) when the token has no ``frame`` key.
    """
    corpus_file, position = sentence_id.split(" ")
    split_key = corpus_file.split("-")[-1]
    verb = conlls_propbank[split_key][int(position) - 1][pred_token]

    if verb["lemma"] != lemma:
        print(sentence_id, pred_token, lemma, verb)

    if "frame" not in verb:
        print("warning: missing frame, ", verb)
        return "N/A"
    return verb["frame"]

# Each column of a row is itself a one-element Series (the columns are
# two-level), so the value is taken by position. `.iloc[0]` does that
# explicitly; the former `[0]` relied on integer keys being treated as
# positions, which pandas has deprecated.
roles_wide["Roleset"] = roles_wide.apply(
    lambda x: get_roleset(
        x["Sentence.ID"].iloc[0],
        x["Pred.Token"].iloc[0],
        x["Pred.Lemma"].iloc[0],
    ),
    axis=1,
)

roles_wide.head()


  lambda x: get_roleset(x["Sentence.ID"][0], x["Pred.Token"][0], x["Pred.Lemma"][0]),


en-ud-dev.conllu 1304 1 se se
en-ud-dev.conllu 1304 1 se se
en-ud-train.conllu 10228 2 fixede fixeded
en-ud-train.conllu 10228 2 fixede fixeded
en-ud-train.conllu 10228 2 fixede fixeded
en-ud-train.conllu 10228 2 fixede fixeded
en-ud-train.conllu 10432 1 reccomend reccomend
en-ud-train.conllu 10432 1 reccomend reccomend
en-ud-train.conllu 10432 1 reccomend reccomend
en-ud-train.conllu 10432 1 reccomend reccomend
en-ud-train.conllu 11217 4 where where
en-ud-train.conllu 11217 4 where where
en-ud-train.conllu 11808 1 thouhgt thouhgt
en-ud-train.conllu 11808 1 thouhgt thouhgt
en-ud-train.conllu 12055 1 precede preceded
en-ud-train.conllu 12055 1 precede preceded
en-ud-train.conllu 7411 2 wan wan
en-ud-train.conllu 7411 2 wan wan


Unnamed: 0_level_0,Sentence.ID,Pred.Lemma,Arg.Phrase,Pred.Token,Arg.Tokens.Begin,Arg.Tokens.End,Gram.Func,Split,Sent.Grammatical,Annotator.ID,...,Response,tree,sentence,spacy_parse,Arg.Tree,Arg.treepos,Predicate,Arg.Stripped,Arg.Parse,Roleset
Property,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,...,was_used,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,en-ud-dev.conllu 1,come,this story,3,4,5,nsubj,dev,5.0,7,...,1.0,"[[[From], [(DT the), (NNP AP)]], [[comes]], [[...",From the AP comes this story :,"{'text': 'From the AP comes this story : ', 's...","[[this], [story]]","(2,)",comes,this story,"{'text': 'this story ', 'sents': [{'start': 0,...",come.03
1,en-ud-dev.conllu 1,come,this story,3,4,5,nsubj,dev,5.0,20,...,5.0,"[[[From], [(DT the), (NNP AP)]], [[comes]], [[...",From the AP comes this story :,"{'text': 'From the AP comes this story : ', 's...","[[this], [story]]","(2,)",comes,this story,"{'text': 'this story ', 'sents': [{'start': 0,...",come.03
2,en-ud-dev.conllu 100,cry,America,1,0,0,nsubj,dev,2.0,46,...,5.0,"[[[America], [(VBD cried), (NP+NN wolf), (PP-L...","America cried wolf in Iraq , and what 's scary...","{'text': 'America cried wolf in Iraq, and what...",[America],"(0, 0)",cried,America,"{'text': 'America ', 'sents': [{'start': 0, 'e...",cry.01
3,en-ud-dev.conllu 100,cry,America,1,0,0,nsubj,dev,5.0,15,...,5.0,"[[[America], [(VBD cried), (NP+NN wolf), (PP-L...","America cried wolf in Iraq , and what 's scary...","{'text': 'America cried wolf in Iraq, and what...",[America],"(0, 0)",cried,America,"{'text': 'America ', 'sents': [{'start': 0, 'e...",cry.01
4,en-ud-dev.conllu 100,cry,wolf,1,2,2,dobj,dev,5.0,26,...,3.0,"[[[America], [(VBD cried), (NP+NN wolf), (PP-L...","America cried wolf in Iraq , and what 's scary...","{'text': 'America cried wolf in Iraq, and what...",[wolf],"(0, 1, 1)",cried,wolf,"{'text': 'wolf ', 'sents': [{'start': 0, 'end'...",cry.01


In [16]:
def get_arg_nums(sentence_id, arg_start_idx, arg_end_idx):
    """Return the last character of the first ARG role inside a token span.

    ``sentence_id`` is ``"<file name> <1-based sentence number>"``. Tokens
    ``arg_start_idx`` through ``arg_end_idx`` (inclusive) of that sentence in
    ``conlls_propbank`` are scanned in order. For the first token whose
    ``role`` value contains ``"ARG"``, the final character of that value is
    returned. When no such token exists a warning is printed and ``np.nan``
    is returned.
    """
    corpus_file, position = sentence_id.split(" ")
    split_key = corpus_file.split("-")[-1]
    sentence_idx = int(position) - 1

    # iterate through all tokens in arg to find the role
    for token_idx in range(arg_start_idx, arg_end_idx + 1):
        token = conlls_propbank[split_key][sentence_idx][token_idx]
        if "role" not in token:
            continue
        role = token["role"]
        if "ARG" in role:
            return role[-1]

    print("Warning: could not find ARG role ", sentence_id, arg_start_idx, arg_end_idx)
    return np.nan

# Each column of a row is itself a one-element Series (the columns are
# two-level), so the value is taken by position. `.iloc[0]` does that
# explicitly; the former `[0]` relied on integer keys being treated as
# positions, which pandas has deprecated.
roles_wide["Arg"] = roles_wide.apply(
    lambda x: get_arg_nums(
        x["Sentence.ID"].iloc[0],
        x["Arg.Tokens.Begin"].iloc[0],
        x["Arg.Tokens.End"].iloc[0],
    ),
    axis=1,
)
print(roles_wide.shape)
# roles_wide = roles_wide.dropna(subset=[("Arg", "")])
roles_wide.head()




  x["Sentence.ID"][0],
  x["Arg.Tokens.Begin"][0],
  x["Arg.Tokens.End"][0]),


(7499, 48)


Unnamed: 0_level_0,Sentence.ID,Pred.Lemma,Arg.Phrase,Pred.Token,Arg.Tokens.Begin,Arg.Tokens.End,Gram.Func,Split,Sent.Grammatical,Annotator.ID,...,tree,sentence,spacy_parse,Arg.Tree,Arg.treepos,Predicate,Arg.Stripped,Arg.Parse,Roleset,Arg
Property,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,...,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,en-ud-dev.conllu 1,come,this story,3,4,5,nsubj,dev,5.0,7,...,"[[[From], [(DT the), (NNP AP)]], [[comes]], [[...",From the AP comes this story :,"{'text': 'From the AP comes this story : ', 's...","[[this], [story]]","(2,)",comes,this story,"{'text': 'this story ', 'sents': [{'start': 0,...",come.03,1
1,en-ud-dev.conllu 1,come,this story,3,4,5,nsubj,dev,5.0,20,...,"[[[From], [(DT the), (NNP AP)]], [[comes]], [[...",From the AP comes this story :,"{'text': 'From the AP comes this story : ', 's...","[[this], [story]]","(2,)",comes,this story,"{'text': 'this story ', 'sents': [{'start': 0,...",come.03,1
2,en-ud-dev.conllu 100,cry,America,1,0,0,nsubj,dev,2.0,46,...,"[[[America], [(VBD cried), (NP+NN wolf), (PP-L...","America cried wolf in Iraq , and what 's scary...","{'text': 'America cried wolf in Iraq, and what...",[America],"(0, 0)",cried,America,"{'text': 'America ', 'sents': [{'start': 0, 'e...",cry.01,0
3,en-ud-dev.conllu 100,cry,America,1,0,0,nsubj,dev,5.0,15,...,"[[[America], [(VBD cried), (NP+NN wolf), (PP-L...","America cried wolf in Iraq , and what 's scary...","{'text': 'America cried wolf in Iraq, and what...",[America],"(0, 0)",cried,America,"{'text': 'America ', 'sents': [{'start': 0, 'e...",cry.01,0
4,en-ud-dev.conllu 100,cry,wolf,1,2,2,dobj,dev,5.0,26,...,"[[[America], [(VBD cried), (NP+NN wolf), (PP-L...","America cried wolf in Iraq , and what 's scary...","{'text': 'America cried wolf in Iraq, and what...",[wolf],"(0, 1, 1)",cried,wolf,"{'text': 'wolf ', 'sents': [{'start': 0, 'end'...",cry.01,1


In [17]:
# Optional, currently disabled: drop the object-valued columns before export.
# roles_wide = roles_wide.drop(columns=["tree", "spacy_parse", "Arg.Tree", "Arg.treepos"])
# Write the full table, including the columns listed above, to CSV.
roles_wide.to_csv("decomp_ud_modified_sentences.csv")
