In [1]:
import pandas as pd
import numpy as np
import nltk
import pattern.en 
import lemminflect

from nltk.corpus.reader.propbank import PropbankTreePointer

import spacy
import re


In [2]:
# use spacy parsing model
spacy_nlp = spacy.load('en_core_web_lg')

# use custom tokenizer that splits only at spaces
spacy_nlp.tokenizer = spacy.tokenizer.Tokenizer(
    spacy_nlp.vocab, token_match=re.compile(r'\S+').match)


In [3]:
roles_pb = pd.read_csv("../../decomp/protoroles_eng_pb_08302015.tsv", delim_whitespace="\t")


  roles_pb = pd.read_csv("../../decomp/protoroles_eng_pb_08302015.tsv", delim_whitespace="\t")


In [4]:
path = "/Users/lli/Desktop/code/datasets/penn_treebank/package/treebank_3/tagged/pos/wsj/"

def get_sentence(id):
    file_id, sent_id = id.split("_")
    file_path = path + f"{file_id[:2]}/wsj_{file_id}.pos"
    with open(file_path, mode="r") as f:
        lines = list(
            filter(
                lambda x: x != "", (map(lambda x: x.strip().replace("=", "/. "), f.readlines()))
                )
            )
        lines = list(filter(
            lambda x: x != "",
            " ".join(
                filter(
                    lambda x: x not in ["[", "]"],
                    " ".join(lines).split(" ")
                    )
                ).split("/. ")
            ))
        # return lines
        lines = list(
            map(
                lambda x: list(map(
                    lambda y: y.split("/")[0],
                    x.strip().split(" ")
                    )),
                lines
            )
        )
    return lines[int(sent_id)]


In [5]:
path = "/Users/lli/Desktop/code/datasets/penn_treebank/package/treebank_3/parsed/mrg/wsj/"

def get_sentence_parse(id):
    try:
        file_id, sent_id = id.split("_")
        file_path = path + f"{file_id[:2]}/wsj_{file_id}.mrg"
        with open(file_path, mode="r") as f:
            lines = list(map(
                lambda x: "_START_" + x.strip() + " " if x[:3] == "( (" or x[:2] == "((" else x.strip() + " ",
                f.readlines()
                ))
        lines = "".join(lines).split("_START_")
        lines = list(map(
            lambda x: x[1:-2],
            filter(lambda x: x.strip() != "", lines))
            )
        parsed = lines[int(sent_id)]
        tree = nltk.tree.ParentedTree.fromstring(parsed)
        return tree
    except:
        print(id)
        return None


In [6]:
modified_pb = pd.read_csv("decomp_pb_modified_sentences.csv")[["Sentence.ID", "Arg.Pos", "Arg.Phrase", "Arg.Stripped"]].set_index(["Sentence.ID", "Arg.Pos"])

modified_pb


Unnamed: 0_level_0,Unnamed: 1_level_0,Arg.Phrase,Arg.Stripped
Sentence.ID,Arg.Pos,Unnamed: 2_level_1,Unnamed: 3_level_1
0003_21,3:1,the Environmental Protection Agency,the Environmental Protection Agency
0003_21,8:1,a gradual ban,a ban
0003_21,11:1,on virtually all uses of asbestos,on all uses
0003_25,0:1,Workers,Workers
0003_25,2:2,large burlap sacks of the imported material,burlap sacks
...,...,...,...
2454_31,3:1,themselves,themselves
2454_31,4:1,to be anything else,to be anything
2454_7,13:1,the old man,the man
2454_7,17:1,his fists,the fists


In [7]:
# "In July, a gradual ban was imposed by the Environmental Protection Agency on virtually all uses of asbestos"
# "In July, a gradual ban was imposed on virtually all uses of asbestos by the Environmental Protection Agency"

# TODO: experiment with moving both the PP with the object and not

def swap_subj_obj(tree, positions):
    subtrees = []
    pointer_treepos = []

    for position in positions:
        pointer = PropbankTreePointer(*map(int, position.split(":")))
        tp = pointer.treepos(tree)
        subtree = pointer.select(tree)

        pointer_treepos.append(tp)
        assert tree[tp] == subtree
        subtrees.append(subtree.copy())

    # swap subtrees in-place
    subtrees = subtrees[::-1]
    for subtree, tp in zip(subtrees, pointer_treepos):
        tree[tp] = subtree
        
    return subtrees, pointer_treepos


In [8]:
import requests
import bs4


def wiki_is_plural(root):
    url = 'https://en.wiktionary.org/api/rest_v1/page/definition/'
    headers = {
         "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36"
        }
    
    wiki = requests.get(url + root, headers=headers).json()
    if 'en' not in wiki:
        return False
    for result in wiki['en']:
        for definition in result["definitions"]:            
            soup = bs4.BeautifulSoup(definition["definition"], parse_only=bs4.SoupStrainer('a'))
            for link in soup.find_all('a', href=True):
                if link['href'] == "/wiki/Appendix:Glossary#plural_number":
                    return True
    return False


def is_plural(subtree, spacy_root):
    # no conjunctions
    for child in subtree:
        if isinstance(child, nltk.tree.ParentedTree) and child.label() == "CC":
            return True

    root = spacy_root.text
    # get index in subtree
    tp = subtree.leaf_treeposition(subtree.leaves().index(root))
    if subtree[tp[:-1]].label() in ["NNS", "NNPS"]:
        return True
    if subtree[tp[:-1]].label() in ["NN", "NNP"]:
        return False

    # make request to wikitionary API
    if wiki_is_plural(root):
        return True
            
    return False


In [9]:

prn_inflect = {
    "i": "me",
    "me": "I",
    "he": "him",
    "him": "he",
    "she": "her",
    "her": "she",
    "we": "us",
    "us": "we",
    "they": "them",
    "them": "they"
}

anaphors = {
    "itself", "themself", "themselves", "herself", "himself", "myself", "yourself"
}

def fix_pronouns(subtrees, spacy_roots):
    # replace prps in the subtree
    for subtree, root in zip(subtrees, spacy_roots):
        root = root.text
        if root.lower() in prn_inflect:         
            idx = subtree.leaves().index(root)
            tp = subtree.leaf_treeposition(idx)
            subtree[tp] = prn_inflect[root.lower()]


def fix_cases(tree, subj, obj):
    for subtree in [subj, obj]:
        tp = subtree.leaf_treeposition(0)
        arg_first_word = subtree[tp]
        
        # check if subj is first word in sentence
        if subtree == subj and arg_first_word == tree.leaves()[0]:
            # make upper case
            arg_first_word = arg_first_word[0].upper() + arg_first_word[1:]
            subtree[tp] = arg_first_word

        # else only capitalize proper nouns
        elif subtree[tp[:-1]].label() not in ["NNP", "NNPS"]:
            # make lower lower case
            arg_first_word = arg_first_word[0].lower() + arg_first_word[1:]
            subtree[tp] = arg_first_word


In [10]:
# imposed -> was imposed by
# imposes -> is imposed by
# will impose -> will be imposed by

# Future tense verbs have already been removed from the dataset

def in_conjunction(subtree):
    for child in subtree:
        if isinstance(child, nltk.tree.ParentedTree) and child.label() == "CC":
            return True

    if isinstance(subtree, nltk.tree.ParentedTree) and subtree.parent():
        return in_conjunction(subtree.parent())
    return False


def make_verb_passive(tree, vpos, new_subj, subj_root):
    verb = tree[vpos]
    
    tenses = pattern.en.tenses(verb)
    lemma = lemminflect.getLemma(verb, upos="VERB")[0]
    past_participle = lemminflect.getInflection(lemma, tag='VBN')[0]
    plural = is_plural(new_subj, subj_root)

    # move the verb's particle, if any
    lp = tree.leaves().index(verb)
    if lp + 1 < len(tree.leaves()):
        right_pos =  tree.leaf_treeposition(lp + 1)
        right_sibling = tree[right_pos[:-1]]
        if right_sibling.label() == "PRT":
            past_participle += " " + " ".join(right_sibling.leaves())
            # remove the sibling
            tree[right_pos] = ''
    
    # move any adverbs
    # "[adv] were [past participle] by" -> "were [adv] [past participle] by"
    if lp > 0:
        left_pos =  tree.leaf_treeposition(lp - 1)
        left_sibling = tree[left_pos[:-1]]
        if left_sibling.label() == "RB" and "ADVP" in left_sibling.parent().label():
            past_participle = " ".join(left_sibling.leaves()) + " " + past_participle
            # remove the sibling
            tree[left_pos] = ''

    if pattern.en.PAST in tenses:
        if plural:
            new_verb = f"were {past_participle} by"
        else:
            new_verb = f"was {past_participle} by"
    else:
        if plural:
            new_verb = f"are {past_participle} by"
        else:
            new_verb = f"is {past_participle} by"
    tree[vpos] = new_verb


def make_passive(row, inplace=False, return_args=True):
    if inplace:
        tree = row[("tree", "")]
    else:
        tree = row[("tree", "")].copy(deep=True)
    vpos = tree.leaf_treeposition(row[("Pred.Token", "")])

    # cannot passify when there is more than one verb
    if in_conjunction(tree[vpos[:-1]]):
        return np.nan

    # former obj becomes new subj
    (new_subj, new_obj), arg_treepos = swap_subj_obj(
        tree, [row[("Arg.Pos", "subj")][0], row[("Arg.Pos", "obj")][0]])
    
    # use spacy dependency parsing to get root of each arg
    subj_root = spacy_nlp(" ".join(new_subj.leaves()))[0].sent.root
    obj_root = spacy_nlp(" ".join(new_obj.leaves()))[0].sent.root

    # remove sentences with unbound anaphors
    if subj_root.text in anaphors:
        return np.nan
    
    # correct arguments and verb in place
    make_verb_passive(tree, vpos, new_subj, subj_root)
    fix_pronouns([new_subj, new_obj], [subj_root, obj_root])
    fix_cases(tree, new_subj, new_obj)

    if return_args:
        return (
            tree, 
            vpos,
            arg_treepos[0],
            arg_treepos[1]
            )
    else:
        return tree


In [15]:
roles_wide = roles_pb.pivot_table(index=["Sentence.ID",  "Roleset", "Arg", "Pred.Token", "Arg.Pos", "Gram.Func", "Split"], columns="Property", values="Response").dropna()
roles_wide.reset_index(inplace=True)

sentences = roles_wide.pivot_table(index=["Sentence.ID", "Roleset", "Pred.Token"], columns="Gram.Func", values=["Arg.Pos", "Split", "Arg"], aggfunc=list).reset_index()
sentences


Property,Sentence.ID,Roleset,Pred.Token,Arg,Arg,Arg,Arg.Pos,Arg.Pos,Arg.Pos,Split,Split,Split
Gram.Func,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,obj,other,subj,obj,other,subj,obj,other,subj
0,0003_21,impose.01,7,[1],[2],[0],[8:1],[11:1],[3:1],[test],[train],[train]
1,0003_25,dump.01,1,[1],,[0],[2:2],,[0:1],[train],,[train]
2,0003_25,mix.01,22,[1],,[0],[23:1],,[0:1],[train],,[train]
3,0003_25,pour.01,14,[1],,[0],[16:1],,[0:1],[train],,[train]
4,0003_29,have.03,1,[1],,[0],[2:2],,[0:1],[train],,[test]
...,...,...,...,...,...,...,...,...,...,...,...,...
5196,2453_6,succumb.01,3,,[1],[0],,[4:1],[1:1],,[train],[train]
5197,2454_15,shake.01,12,[1],,[0],[13:1],,[0:2],[train],,[test]
5198,2454_18,release.01,4,[1],,[0],[5:1],,[0:1],[train],,[test]
5199,2454_31,consider.01,2,,"[1, 2]",,,"[3:1, 4:1]",,,"[train, train]",


In [16]:
# can only passify sentences with both subject AND objects

# remove sentences containing multiple verbs or no object
sentences = sentences.dropna(
    subset=[('Arg.Pos','obj'), ('Arg.Pos','subj')]).drop_duplicates(
        subset=[("Sentence.ID", "")], keep=False)

# remove sentences with more than one object
sentences[('Arg.Pos','obj')] = sentences[('Arg.Pos','obj')].apply(
    lambda x: x if len(x) == 1 else np.nan)
sentences = sentences.dropna(subset=[('Arg.Pos','obj')])

sentences["tree"] = sentences["Sentence.ID"].apply(get_sentence_parse)


In [18]:
# need to run this cell twice for some reason

sentences[
    ["passive tree", "verb_pos", "subj_pos", "obj_pos"]
    ] = sentences.apply(
    make_passive,
    axis=1,
    result_type="expand")
sentences = sentences.dropna(subset=[("passive tree", '')]).reset_index(drop=True)


  soup = bs4.BeautifulSoup(definition["definition"], parse_only=bs4.SoupStrainer('a'))


In [19]:
sentences.rename(columns={"tree": "active tree"}, inplace=True)
sentences.drop(columns=['Arg.Pos', 'Pred.Token'], inplace=True)

sentences["active sentence"] = sentences["active tree"].apply(
    lambda x: " ".join(x.leaves())
)
sentences["passive sentence"] = sentences["passive tree"].apply(
    lambda x: " ".join(x.leaves())
)

sentences["active verb"] = sentences.apply(
    lambda x: x[("active tree", '')][x[("verb_pos", '')]],
    axis=1
)
sentences["active subj"] = sentences.apply(
    lambda x: " ".join(x[("active tree", '')][x[("subj_pos", '')]].leaves()),
    axis=1
)
sentences["active obj"] = sentences.apply(
    lambda x: " ".join(x[("active tree", '')][x[("obj_pos", '')]].leaves()),
    axis=1
)

sentences["passive verb"] = sentences.apply(
    lambda x: x[("passive tree", '')][x[("verb_pos", '')]],
    axis=1
)
sentences["passive subj"] = sentences.apply(
    lambda x: " ".join(x[("passive tree", '')][x[("subj_pos", '')]].leaves()),
    axis=1
)
sentences["passive obj"] = sentences.apply(
    lambda x: " ".join(x[("passive tree", '')][x[("obj_pos", '')]].leaves()),
    axis=1
)

sentences


  sentences.drop(columns=['Arg.Pos', 'Pred.Token'], inplace=True)


Property,Sentence.ID,Roleset,Arg,Arg,Arg,Split,Split,Split,active tree,passive tree,...,subj_pos,obj_pos,active sentence,passive sentence,active verb,active subj,active obj,passive verb,passive subj,passive obj
Gram.Func,Unnamed: 1_level_1,Unnamed: 2_level_1,obj,other,subj,obj,other,subj,Unnamed: 9_level_1,Unnamed: 10_level_1,...,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0003_21,impose.01,[1],[2],[0],[test],[train],[train],"[[[In], [(NNP July)]], [,], [[the], [Environme...","[[[In], [(NNP July)]], [,], [[a], [gradual], [...",...,"(2,)","(3, 1)","In July , the Environmental Protection Agency ...","In July , a gradual ban was imposed by the Env...",imposed,the Environmental Protection Agency,a gradual ban,was imposed by,a gradual ban,the Environmental Protection Agency
1,0003_29,have.03,[1],,[0],[train],,[test],"[[[It]], [[has], [(NP (DT no) (NN bearing)), (...","[[[(DT No), (NN bearing)], [(IN on), (NP (NP (...",...,"(0,)","(1, 1)",It has no bearing on our work force today .,No bearing on our work force today is had by it .,has,It,no bearing on our work force today,is had by,No bearing on our work force today,it
2,0003_9,lead.02,[1],,[0],[train],,[train],"[[[Dr.], [Talcott]], [[led], [(NP (DT a) (NN t...","[[[(DT A), (NN team)], [(IN of), (NP\n (NP (N...",...,"(0,)","(1, 1)",Dr. Talcott led a team of researchers from the...,A team of researchers from the National Cancer...,led,Dr. Talcott,a team of researchers from the National Cancer...,was led by,A team of researchers from the National Cancer...,Dr. Talcott
3,0004_11,beat.03,[1],,[0],[train],,[test],"[[[Typically]], [,], [[money-fund], [yields]],...","[[[Typically]], [,], [[comparable], [short-ter...",...,"(2,)","(3, 1)","Typically , money-fund yields beat comparable ...","Typically , comparable short-term investments ...",beat,money-fund yields,comparable short-term investments,were beaten by,comparable short-term investments,money-fund yields
4,0008_0,suspend.01,[1],,[0],[train],,[dev],"[[[The], [federal], [government]], [[suspended...","[[[(NNS Sales)], [(IN of), (NP (NNP U.S.) (NNS...",...,"(0,)","(1, 1)",The federal government suspended sales of U.S....,Sales of U.S. savings bonds were suspended by ...,suspended,The federal government,sales of U.S. savings bonds,were suspended by,Sales of U.S. savings bonds,the federal government
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1900,2452_9,deny.01,[1],,[0],[train],,[train],"[[[(DT The), (NN company)], [and], [(PRP$ its)...","[[[The], [charges]], [[are denied by], [(NP (D...",...,"(0,)","(1, 1)",The company and its executives deny the charges .,The charges are denied by the company and its ...,deny,The company and its executives,the charges,are denied by,The charges,the company and its executives
1901,2453_0,report.01,[1],,[0],[train],,[train],"[[[Intel], [Corp.]], [[reported], [(NP (DT a) ...","[[[(DT A), (CD 50), (NN %), (NN drop)], [(IN i...",...,"(0,)","(1, 1)",Intel Corp. reported a 50 % drop in third-quar...,A 50 % drop in third-quarter net income was re...,reported,Intel Corp.,a 50 % drop in third-quarter net income,was reported by,A 50 % drop in third-quarter net income,Intel Corp.
1902,2454_15,shake.01,[1],,[0],[train],,[test],"[[[(VBG Deafening), (NNS chants)], [(IN of), (...","[[[The], [church]], [[was shaken by], [(NP (VB...",...,"(0,)","(1, 1)",Deafening chants of `` ANC '' and `` Umkhonto ...,The church was shaken by deafening chants of `...,shook,Deafening chants of `` ANC '' and `` Umkhonto ...,the church,was shaken by,The church,deafening chants of `` ANC '' and `` Umkhonto ...
1903,2454_18,release.01,[1],,[0],[train],,[test],"[[[President], [F.W.], [de], [Klerk]], [[relea...","[[[The], [ANC], [men]], [[were released by], [...",...,"(0,)","(1, 1)",President F.W. de Klerk released the ANC men -...,The ANC men were released by President F.W. de...,released,President F.W. de Klerk,the ANC men,were released by,The ANC men,President F.W. de Klerk


In [20]:
def treepos_to_index(tree, argpos, arg, sentence, pos="subj"):
    leaves = tree.leaves()
    arg_leaves = arg.split(" ")

    # predicate always has token length of 1 in the tree, even when passive
    if pos == "verb":
        arg_leaves = [arg]
        arg_tree_length = 1
    else:
        arg_tree_length = len(arg_leaves)

    matches = []
    potential_matches = []
    for i in range(len(leaves) - arg_tree_length + 1):
        if leaves[i : i + arg_tree_length] == arg_leaves:
            potential_matches.append((i, i + arg_tree_length))
            treepos = tree.leaf_treeposition(i)
            if treepos[:len(argpos)] == argpos:
                matches.append((i, i + arg_tree_length))
    assert len(matches) == 1, (argpos, leaves, arg_leaves, potential_matches)

    if pos == "obj":
        # after the predicate, so we add 2 to index b/c the verb was added to
        diff = len(sentence.split(" ")) - len(leaves)
        match = (matches[0][0] + diff, matches[0][1] + diff)
    elif pos == "verb":
        # add 2 to the end of the matched sequence
        diff = len(sentence.split(" ")) - len(leaves)
        match = (matches[0][0], matches[0][1] + diff)
        arg_leaves = arg.split(" ")
    else:
        match = matches[0]
    
    if sentence.split(" ")[match[0]: match[1]] != arg_leaves:
        print()
        print(match[0], leaves)
        print(sentence.split(" ")[match[0]: match[1]], arg_leaves)
        return np.nan
    
    return match


In [21]:
sentences["passive subj idx"] = sentences.apply(
    lambda x: treepos_to_index(
        x[("passive tree", "")], x[("subj_pos", "")], x[("passive subj", "")],
        x[("passive sentence", "")], pos="subj"),
    axis=1
)
sentences["passive obj idx"] = sentences.apply(
    lambda x: treepos_to_index(
        x[("passive tree", "")], x[("obj_pos", "")], x[("passive obj", "")],
        x[("passive sentence", "")], pos="obj"),
    axis=1
)
sentences["passive verb idx"] = sentences.apply(
    lambda x: treepos_to_index(
        x[("passive tree", "")], x[("verb_pos", "")], x[("passive verb", "")],
        x[("passive sentence", "")], pos="verb"),
    axis=1
)
sentences.dropna(subset=[("passive obj idx", ""), ("passive subj idx", "")], inplace=True)



27 ['is Shaved by', 'away', 'a', 'decline', 'in', 'U.S.', 'oil', 'production', 'to', '502,000', 'barrels', 'of', 'oil', 'a', 'day', 'during', 'the', 'quarter', 'from', '527,000', 'barrels', 'a', 'day', 'last', 'year', 'was', '*T*-1', 'some', 'of', 'the', 'gain', 'in', 'that', 'unit', '.']
['was', '*T*-1', 'some', 'of', 'the', 'gain', 'in'] ['some', 'of', 'the', 'gain', 'in', 'that', 'unit']

7 ['is Led by', 'United', 'Egg', 'Producers', 'has', 'been', '*T*-1', 'the', 'assault', 'against', 'the', 'Egg', 'King', '.']
['been', '*T*-1', 'the', 'assault', 'against', 'the'] ['the', 'assault', 'against', 'the', 'Egg', 'King']


In [22]:
old_sentences = sentences.copy(deep=True)


In [23]:
sentences.columns


MultiIndex([(     'Sentence.ID',      ''),
            (         'Roleset',      ''),
            (             'Arg',   'obj'),
            (             'Arg', 'other'),
            (             'Arg',  'subj'),
            (           'Split',   'obj'),
            (           'Split', 'other'),
            (           'Split',  'subj'),
            (     'active tree',      ''),
            (    'passive tree',      ''),
            (        'verb_pos',      ''),
            (        'subj_pos',      ''),
            (         'obj_pos',      ''),
            ( 'active sentence',      ''),
            ('passive sentence',      ''),
            (     'active verb',      ''),
            (     'active subj',      ''),
            (      'active obj',      ''),
            (    'passive verb',      ''),
            (    'passive subj',      ''),
            (     'passive obj',      ''),
            ('passive subj idx',      ''),
            ( 'passive obj idx',      ''),
           

In [38]:
sentences = old_sentences.copy(deep=True)


In [39]:
sentences.drop(columns=[("Split", "other"), 'active tree',
                         'passive tree', 'verb_pos', 'subj_pos', 'obj_pos', ("Arg", "other")
                         ], inplace=True)
sentences[("Split", "obj")] = sentences[("Split", "obj")].apply(lambda x: x[0])
sentences[("Split", "subj")] = sentences[("Split", "subj")].apply(lambda x: x[0])
sentences[("obj arg")] = sentences[("Arg", "obj")].apply(lambda x: x[0])
sentences[("subj arg")] = sentences[("Arg", "subj")].apply(lambda x: x[0])

sentences = sentences.melt(value_vars=[
    ("Split", "obj"), ("Split", "subj")
    ],
    id_vars=[
        (     'Sentence.ID',     ''),
        (         'Roleset',     ''),
        ( 'active sentence',     ''),
        ('passive sentence',     ''),
        (     'active verb',     ''),
        (     'active subj',     ''),
        (      'active obj',     ''),
        (    'passive verb',     ''),
        (    'passive subj',     ''),
        (     'passive obj',     ''),
        ('passive subj idx',     ''),
        ( 'passive obj idx',     ''),
        ('passive verb idx',     ''),
        ("subj arg", ""), ("obj arg", "")
    ])

sentences.columns = map(lambda x: x[0] if not isinstance(x, str) else x, sentences.columns)
sentences['active'] = sentences.apply(
    lambda x: x["active " + x['Gram.Func']], axis=1
)
sentences['passive'] = sentences.apply(
    lambda x: x["passive subj"] if x['Gram.Func'] == 'obj' else x["passive obj"], axis=1
)

sentences = sentences.drop(
    columns=['passive subj', 'passive obj', 'active subj', 'active obj']
    ).sort_values(by="Sentence.ID").reset_index(drop=True
    ).set_index(["Sentence.ID", "Roleset", "Gram.Func"])
sentences


  sentences.drop(columns=[("Split", "other"), 'active tree',


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,active sentence,passive sentence,active verb,passive verb,passive subj idx,passive obj idx,passive verb idx,subj arg,obj arg,Property,value,active,passive
Sentence.ID,Roleset,Gram.Func,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0003_21,impose.01,obj,"In July , the Environmental Protection Agency ...","In July , a gradual ban was imposed by the Env...",imposed,was imposed by,"(3, 6)","(9, 13)","(6, 9)",0,1,Split,test,a gradual ban,a gradual ban
0003_21,impose.01,subj,"In July , the Environmental Protection Agency ...","In July , a gradual ban was imposed by the Env...",imposed,was imposed by,"(3, 6)","(9, 13)","(6, 9)",0,1,Split,train,the Environmental Protection Agency,the Environmental Protection Agency
0003_29,have.03,obj,It has no bearing on our work force today .,No bearing on our work force today is had by it .,has,is had by,"(0, 7)","(10, 11)","(7, 10)",0,1,Split,train,no bearing on our work force today,No bearing on our work force today
0003_29,have.03,subj,It has no bearing on our work force today .,No bearing on our work force today is had by it .,has,is had by,"(0, 7)","(10, 11)","(7, 10)",0,1,Split,test,It,it
0003_9,lead.02,obj,Dr. Talcott led a team of researchers from the...,A team of researchers from the National Cancer...,led,was led by,"(0, 19)","(22, 24)","(19, 22)",0,1,Split,train,a team of researchers from the National Cancer...,A team of researchers from the National Cancer...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2454_15,shake.01,obj,Deafening chants of `` ANC '' and `` Umkhonto ...,The church was shaken by deafening chants of `...,shook,was shaken by,"(0, 2)","(5, 17)","(2, 5)",0,1,Split,train,the church,The church
2454_18,release.01,subj,President F.W. de Klerk released the ANC men -...,The ANC men were released by President F.W. de...,released,were released by,"(0, 3)","(6, 10)","(3, 6)",0,1,Split,test,President F.W. de Klerk,President F.W. de Klerk
2454_18,release.01,obj,President F.W. de Klerk released the ANC men -...,The ANC men were released by President F.W. de...,released,were released by,"(0, 3)","(6, 10)","(3, 6)",0,1,Split,train,the ANC men,The ANC men
2454_7,shoot.06,obj,"As the crowd outside his home shouted `` ANC ,...","As the crowd outside his home shouted `` ANC ,...",shot,were shot by,"(13, 15)","(18, 21)","(15, 18)",0,1,Split,train,his fists,his fists


In [43]:
judgements = pd.read_csv("decomp_pb_masked_arguments.csv")[[
    'Sentence.ID', 'Roleset', 'Gram.Func', 'awareness', 'change_of_location',
    'change_of_state', 'changes_possession', 'created', 'destroyed',
    'existed_after', 'existed_before', 'existed_during',
    'exists_as_physical', 'instigation', 'location_of_event',
    'makes_physical_contact', 'manipulated_by_another',
    'predicate_changed_argument', 'sentient', 'stationary', 'volition'
]]
proto_roles_names = judgements.columns[3:]
print(proto_roles_names)

judgements.set_index(["Sentence.ID", "Roleset", "Gram.Func"], inplace=True)
judgements


Index(['awareness', 'change_of_location', 'change_of_state',
       'changes_possession', 'created', 'destroyed', 'existed_after',
       'existed_before', 'existed_during', 'exists_as_physical', 'instigation',
       'location_of_event', 'makes_physical_contact', 'manipulated_by_another',
       'predicate_changed_argument', 'sentient', 'stationary', 'volition'],
      dtype='object')


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,awareness,change_of_location,change_of_state,changes_possession,created,destroyed,existed_after,existed_before,existed_during,exists_as_physical,instigation,location_of_event,makes_physical_contact,manipulated_by_another,predicate_changed_argument,sentient,stationary,volition
Sentence.ID,Roleset,Gram.Func,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0003_21,impose.01,other,1.0,1.0,4.0,1.0,1.0,1.0,5.0,5.0,5.0,1.0,1.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0
0003_21,impose.01,subj,5.0,1.0,3.0,1.0,1.0,1.0,5.0,5.0,5.0,1.0,5.0,1.0,1.0,1.0,3.0,1.0,1.0,5.0
0003_21,impose.01,obj,1.0,1.0,3.0,1.0,5.0,1.0,5.0,1.0,5.0,1.0,1.0,1.0,1.0,5.0,5.0,1.0,1.0,1.0
0003_25,dump.01,subj,5.0,3.0,3.0,1.0,1.0,1.0,5.0,5.0,5.0,5.0,5.0,1.0,5.0,1.0,1.0,5.0,3.0,5.0
0003_25,dump.01,obj,1.0,5.0,5.0,3.0,1.0,5.0,1.0,5.0,5.0,5.0,1.0,1.0,5.0,5.0,5.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2454_31,consider.01,other,5.0,3.0,3.0,1.0,1.0,1.0,5.0,5.0,5.0,5.0,5.0,1.0,3.0,1.0,3.0,5.0,3.0,5.0
2454_31,consider.01,other,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0,1.0
2454_7,shoot.06,subj,5.0,3.0,3.0,1.0,1.0,1.0,5.0,5.0,5.0,5.0,5.0,1.0,3.0,1.0,2.0,5.0,3.0,5.0
2454_7,shoot.06,obj,1.0,5.0,5.0,1.0,1.0,1.0,5.0,5.0,5.0,5.0,1.0,1.0,2.0,5.0,5.0,1.0,1.0,1.0


In [44]:
sentences = pd.merge(sentences, judgements, left_index=True, right_index=True
                     ).drop(columns=['Property'])
sentences.rename(columns={
    "value": "Split",
    "active": "active arg",
    "passive": "passive arg"
}, inplace=True)

sentences = sentences.reset_index()


In [45]:
sentences.head()


Unnamed: 0,Sentence.ID,Roleset,Gram.Func,active sentence,passive sentence,active verb,passive verb,passive subj idx,passive obj idx,passive verb idx,...,existed_during,exists_as_physical,instigation,location_of_event,makes_physical_contact,manipulated_by_another,predicate_changed_argument,sentient,stationary,volition
0,0003_21,impose.01,obj,"In July , the Environmental Protection Agency ...","In July , a gradual ban was imposed by the Env...",imposed,was imposed by,"(3, 6)","(9, 13)","(6, 9)",...,5.0,1.0,1.0,1.0,1.0,5.0,5.0,1.0,1.0,1.0
1,0003_21,impose.01,subj,"In July , the Environmental Protection Agency ...","In July , a gradual ban was imposed by the Env...",imposed,was imposed by,"(3, 6)","(9, 13)","(6, 9)",...,5.0,1.0,5.0,1.0,1.0,1.0,3.0,1.0,1.0,5.0
2,0003_29,have.03,obj,It has no bearing on our work force today .,No bearing on our work force today is had by it .,has,is had by,"(0, 7)","(10, 11)","(7, 10)",...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0003_29,have.03,subj,It has no bearing on our work force today .,No bearing on our work force today is had by it .,has,is had by,"(0, 7)","(10, 11)","(7, 10)",...,5.0,3.0,3.0,1.0,3.0,3.0,1.0,2.0,3.0,3.0
4,0003_9,lead.02,obj,Dr. Talcott led a team of researchers from the...,A team of researchers from the National Cancer...,led,was led by,"(0, 19)","(22, 24)","(19, 22)",...,5.0,5.0,3.0,3.0,3.0,5.0,3.0,1.0,1.0,5.0


In [46]:
modified_pb = modified_pb.reset_index().set_index(["Sentence.ID", "Arg.Phrase"])
modified_pb.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Arg.Pos,Arg.Stripped
Sentence.ID,Arg.Phrase,Unnamed: 2_level_1,Unnamed: 3_level_1
0003_21,the Environmental Protection Agency,3:1,the Environmental Protection Agency
0003_21,a gradual ban,8:1,a ban
0003_21,on virtually all uses of asbestos,11:1,on all uses
0003_25,Workers,0:1,Workers
0003_25,large burlap sacks of the imported material,2:2,burlap sacks


In [47]:
sentences["Arg.Stripped"] = sentences.apply(
    lambda x: modified_pb.loc[x["Sentence.ID"], x["active arg"]]["Arg.Stripped"][0],
    axis=1
)
sentences.head()


  lambda x: modified_pb.loc[x["Sentence.ID"], x["active arg"]]["Arg.Stripped"][0],
  lambda x: modified_pb.loc[x["Sentence.ID"], x["active arg"]]["Arg.Stripped"][0],


Unnamed: 0,Sentence.ID,Roleset,Gram.Func,active sentence,passive sentence,active verb,passive verb,passive subj idx,passive obj idx,passive verb idx,...,exists_as_physical,instigation,location_of_event,makes_physical_contact,manipulated_by_another,predicate_changed_argument,sentient,stationary,volition,Arg.Stripped
0,0003_21,impose.01,obj,"In July , the Environmental Protection Agency ...","In July , a gradual ban was imposed by the Env...",imposed,was imposed by,"(3, 6)","(9, 13)","(6, 9)",...,1.0,1.0,1.0,1.0,5.0,5.0,1.0,1.0,1.0,a ban
1,0003_21,impose.01,subj,"In July , the Environmental Protection Agency ...","In July , a gradual ban was imposed by the Env...",imposed,was imposed by,"(3, 6)","(9, 13)","(6, 9)",...,1.0,5.0,1.0,1.0,1.0,3.0,1.0,1.0,5.0,the Environmental Protection Agency
2,0003_29,have.03,obj,It has no bearing on our work force today .,No bearing on our work force today is had by it .,has,is had by,"(0, 7)","(10, 11)","(7, 10)",...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,no bearing
3,0003_29,have.03,subj,It has no bearing on our work force today .,No bearing on our work force today is had by it .,has,is had by,"(0, 7)","(10, 11)","(7, 10)",...,3.0,3.0,1.0,3.0,3.0,1.0,2.0,3.0,3.0,It
4,0003_9,lead.02,obj,Dr. Talcott led a team of researchers from the...,A team of researchers from the National Cancer...,led,was led by,"(0, 19)","(22, 24)","(19, 22)",...,5.0,3.0,3.0,3.0,5.0,3.0,1.0,1.0,5.0,a team


In [48]:
sentences.to_csv("decomp_passive_with_roles.csv")
