In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus.reader.propbank import PropbankTreePointer


In [2]:
properties_list = ['awareness',
       'change_of_location', 'change_of_state', 'changes_possession',
       'created', 'destroyed', 'existed_after', 'existed_before',
       'existed_during', 'exists_as_physical', 'instigation',
       'location_of_event', 'makes_physical_contact', 'manipulated_by_another',
       'predicate_changed_argument', 'sentient', 'stationary', 'volition']


In [3]:
def treepos_to_index(tree, argpos, arg):
    leaves = tree.leaves()
    arg_leaves = arg.leaves()

    if len(argpos.split(":")) != 2:
        return np.nan
    pointer = PropbankTreePointer(*map(int, argpos.split(":")))
    argpos = pointer.treepos(tree)

    matches = []
    potential_matches = []
    for i in range(len(leaves) - len(arg_leaves) + 1):
        if leaves[i: i+len(arg_leaves)] == arg_leaves:
            potential_matches.append((i, i+len(arg_leaves)))
            treepos = tree.leaf_treeposition(i)
            if treepos[:len(argpos)] == argpos:
                matches.append((i, i+len(arg_leaves)))
    assert len(matches) == 1, (argpos, leaves, arg_leaves, potential_matches)
    match = matches[0]
    return match


In [4]:
active_trees = pd.read_csv("decomp_pb_trees.csv")
active_trees['tree'] = active_trees['tree'].apply(nltk.tree.ParentedTree.fromstring)
active_trees['Arg.Phrase'] = active_trees['Arg.Phrase'].apply(nltk.tree.ParentedTree.fromstring)

active_trees['arg_idx'] = active_trees.apply(
    lambda x: treepos_to_index(
        x["tree"], x["Arg.Pos"], x["Arg.Phrase"]),
    axis=1
)
active_trees['verb_idx'] = active_trees["Pred.Token"].apply(
    lambda x: (x, x+1)
)
print(active_trees.shape)
active_trees.head()


(9738, 29)


Unnamed: 0.1,Unnamed: 0,Sentence.ID,Roleset,Pred.Token,Arg.Pos,Gram.Func,Split,awareness,change_of_location,change_of_state,...,makes_physical_contact,manipulated_by_another,predicate_changed_argument,sentient,stationary,volition,tree,Arg.Phrase,arg_idx,verb_idx
0,0,0003_21,impose.01,7,11:1,other,train,1.0,1.0,4.0,...,1.0,1.0,5.0,1.0,1.0,1.0,"[[[In], [(NNP July)]], [,], [[the], [Environme...","[[on], [[(ADJP (RB virtually) (DT all)), (NNS ...","(11, 17)","(7, 8)"
1,1,0003_21,impose.01,7,3:1,subj,train,5.0,1.0,3.0,...,1.0,1.0,3.0,1.0,1.0,5.0,"[[[In], [(NNP July)]], [,], [[the], [Environme...","[[the], [Environmental], [Protection], [Agency]]","(3, 7)","(7, 8)"
2,2,0003_21,impose.01,7,8:1,obj,test,1.0,1.0,3.0,...,1.0,5.0,5.0,1.0,1.0,1.0,"[[[In], [(NNP July)]], [,], [[the], [Environme...","[[a], [gradual], [ban]]","(8, 11)","(7, 8)"
3,3,0003_25,dump.01,1,0:1,subj,train,5.0,3.0,3.0,...,5.0,1.0,1.0,5.0,3.0,5.0,"[[[Workers]], [[(VBD dumped), (NP\n (NP (JJ l...",[[Workers]],"(0, 1)","(1, 2)"
4,4,0003_25,dump.01,1,2:2,obj,train,1.0,5.0,5.0,...,5.0,5.0,5.0,1.0,1.0,1.0,"[[[Workers]], [[(VBD dumped), (NP\n (NP (JJ l...","[[[large], [burlap], [sacks]], [[of], [(DT the...","(2, 9)","(1, 2)"


In [5]:
roles_wide = pd.read_csv("decomp_pb_modified_sentences.csv", na_filter=False)
roles_wide.drop(columns=["Unnamed: 0", "Arg.Pos", "Sentence.Stripped", "Pred.Token"], inplace=True, axis=1)
roles_wide["structure"] = "active_full"
roles_wide["arg_idx"] = active_trees['arg_idx']
roles_wide["verb_idx"] = active_trees['verb_idx']
roles_wide.dropna(subset="arg_idx", inplace=True)

roles_wide


Unnamed: 0,Sentence.ID,Roleset,Arg,Gram.Func,Split,awareness,change_of_location,change_of_state,changes_possession,created,...,sentient,stationary,volition,Sentence,Arg.Phrase,Arg.Stripped,Predicate,structure,arg_idx,verb_idx
0,0003_21,impose.01,0,subj,train,5.0,1.0,3.0,1.0,1.0,...,1.0,1.0,5.0,"In July , the Environmental Protection Agency ...",the Environmental Protection Agency,the Environmental Protection Agency,imposed,active_full,"(11, 17)","(7, 8)"
1,0003_21,impose.01,1,obj,test,1.0,1.0,3.0,1.0,5.0,...,1.0,1.0,1.0,"In July , the Environmental Protection Agency ...",a gradual ban,a ban,imposed,active_full,"(3, 7)","(7, 8)"
2,0003_21,impose.01,2,other,train,1.0,1.0,4.0,1.0,1.0,...,1.0,1.0,1.0,"In July , the Environmental Protection Agency ...",on virtually all uses of asbestos,on all uses,imposed,active_full,"(8, 11)","(7, 8)"
3,0003_25,dump.01,0,subj,train,5.0,3.0,3.0,1.0,1.0,...,5.0,3.0,5.0,Workers dumped large burlap sacks of the impor...,Workers,Workers,dumped,active_full,"(0, 1)","(1, 2)"
4,0003_25,dump.01,1,obj,train,1.0,5.0,5.0,3.0,1.0,...,1.0,1.0,1.0,Workers dumped large burlap sacks of the impor...,large burlap sacks of the imported material,burlap sacks,dumped,active_full,"(2, 9)","(1, 2)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9733,2454_31,consider.01,1,other,train,5.0,3.0,3.0,1.0,1.0,...,5.0,3.0,5.0,They never considered themselves to be anythin...,themselves,themselves,considered,active_full,"(3, 4)","(2, 3)"
9734,2454_31,consider.01,2,other,train,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,They never considered themselves to be anythin...,to be anything else,to be anything,considered,active_full,"(4, 8)","(2, 3)"
9735,2454_7,shoot.06,0,subj,train,5.0,3.0,3.0,1.0,1.0,...,5.0,3.0,5.0,"As the crowd outside his home shouted `` ANC ,...",the old man,the man,shot,active_full,"(13, 16)","(16, 17)"
9736,2454_7,shoot.06,1,obj,train,1.0,5.0,5.0,1.0,1.0,...,1.0,1.0,1.0,"As the crowd outside his home shouted `` ANC ,...",his fists,the fists,shot,active_full,"(17, 19)","(16, 17)"


In [10]:
roles_wide_passive = pd.read_csv('decomp_passive_with_roles.csv', na_filter=False).drop(columns="Unnamed: 0")

# for sentence, arg in roles_wide_passive[["active sentence", "active arg"]].to_numpy():
#     assert arg in sentence

# for sentence, arg in roles_wide_passive[["passive sentence", "passive arg"]].to_numpy():
#     assert arg in sentence

roles_wide_passive["Arg"] = roles_wide_passive.apply(
    lambda x: x[f"{x['Gram.Func']} arg"],
    axis=1)

# flip the gram of the active sentence to passive sentence
roles_wide_passive["Gram.Func"] = roles_wide_passive["Gram.Func"].apply(
    lambda x: "subj" if x == "obj" else "obj")
roles_wide_passive["arg_idx"] = roles_wide_passive.apply(
    lambda x: x[f"passive {x['Gram.Func']} idx"],
    axis=1)
roles_wide_passive["verb_idx"] = roles_wide_passive.apply(
    lambda x: x[f"passive verb idx"],
    axis=1)

roles_wide_passive["structure"] = "passive_full"

roles_wide_passive = roles_wide_passive.rename(
    columns={"passive sentence":"Sentence", "passive arg":"Arg.Phrase", "passive verb":"Predicate"}
    ).drop(
        columns=["active sentence", "active arg", "active verb",
                 "passive subj idx", "passive obj idx", "passive verb idx", "subj arg", "obj arg"])

roles_wide_passive[properties_list] = roles_wide_passive[properties_list].to_numpy().astype(int)

roles_wide_passive


Unnamed: 0,Sentence.ID,Roleset,Gram.Func,Sentence,Predicate,Split,Arg.Phrase,awareness,change_of_location,change_of_state,...,manipulated_by_another,predicate_changed_argument,sentient,stationary,volition,Arg.Stripped,Arg,arg_idx,verb_idx,structure
0,0003_21,impose.01,subj,"In July , a gradual ban was imposed by the Env...",was imposed by,test,a gradual ban,1,1,3,...,5,5,1,1,1,a ban,1,"(3, 6)","(6, 9)",passive_full
1,0003_21,impose.01,obj,"In July , a gradual ban was imposed by the Env...",was imposed by,train,the Environmental Protection Agency,5,1,3,...,1,3,1,1,5,the Environmental Protection Agency,0,"(9, 13)","(6, 9)",passive_full
2,0003_29,have.03,subj,No bearing on our work force today is had by it .,is had by,train,No bearing on our work force today,1,1,1,...,1,1,1,1,1,no bearing,1,"(0, 7)","(7, 10)",passive_full
3,0003_29,have.03,obj,No bearing on our work force today is had by it .,is had by,test,it,3,3,3,...,3,1,2,3,3,It,0,"(10, 11)","(7, 10)",passive_full
4,0003_9,lead.02,subj,A team of researchers from the National Cancer...,was led by,train,A team of researchers from the National Cancer...,5,3,3,...,5,3,1,1,5,a team,1,"(0, 19)","(19, 22)",passive_full
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3801,2454_15,shake.01,subj,The church was shaken by deafening chants of `...,was shaken by,train,The church,1,1,1,...,5,1,1,5,1,the church,1,"(0, 2)","(2, 5)",passive_full
3802,2454_18,release.01,obj,The ANC men were released by President F.W. de...,were released by,test,President F.W. de Klerk,5,3,3,...,1,2,5,3,5,President F.W. de Klerk,0,"(6, 10)","(3, 6)",passive_full
3803,2454_18,release.01,subj,The ANC men were released by President F.W. de...,were released by,train,The ANC men,5,5,5,...,1,5,5,1,1,the ANC men,1,"(0, 3)","(3, 6)",passive_full
3804,2454_7,shoot.06,subj,"As the crowd outside his home shouted `` ANC ,...",were shot by,train,his fists,1,5,5,...,5,5,1,1,1,the fists,1,"(13, 15)","(15, 18)",passive_full


In [11]:
roles_combined = pd.concat([roles_wide_passive, roles_wide])

roles_combined.to_csv("active_passive_decompV1.csv")
roles_combined


Unnamed: 0,Sentence.ID,Roleset,Gram.Func,Sentence,Predicate,Split,Arg.Phrase,awareness,change_of_location,change_of_state,...,manipulated_by_another,predicate_changed_argument,sentient,stationary,volition,Arg.Stripped,Arg,arg_idx,verb_idx,structure
0,0003_21,impose.01,subj,"In July , a gradual ban was imposed by the Env...",was imposed by,test,a gradual ban,1.0,1.0,3.0,...,5.0,5.0,1.0,1.0,1.0,a ban,1,"(3, 6)","(6, 9)",passive_full
1,0003_21,impose.01,obj,"In July , a gradual ban was imposed by the Env...",was imposed by,train,the Environmental Protection Agency,5.0,1.0,3.0,...,1.0,3.0,1.0,1.0,5.0,the Environmental Protection Agency,0,"(9, 13)","(6, 9)",passive_full
2,0003_29,have.03,subj,No bearing on our work force today is had by it .,is had by,train,No bearing on our work force today,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,no bearing,1,"(0, 7)","(7, 10)",passive_full
3,0003_29,have.03,obj,No bearing on our work force today is had by it .,is had by,test,it,3.0,3.0,3.0,...,3.0,1.0,2.0,3.0,3.0,It,0,"(10, 11)","(7, 10)",passive_full
4,0003_9,lead.02,subj,A team of researchers from the National Cancer...,was led by,train,A team of researchers from the National Cancer...,5.0,3.0,3.0,...,5.0,3.0,1.0,1.0,5.0,a team,1,"(0, 19)","(19, 22)",passive_full
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9733,2454_31,consider.01,other,They never considered themselves to be anythin...,considered,train,themselves,5.0,3.0,3.0,...,1.0,3.0,5.0,3.0,5.0,themselves,1,"(3, 4)","(2, 3)",active_full
9734,2454_31,consider.01,other,They never considered themselves to be anythin...,considered,train,to be anything else,1.0,1.0,1.0,...,5.0,1.0,1.0,1.0,1.0,to be anything,2,"(4, 8)","(2, 3)",active_full
9735,2454_7,shoot.06,subj,"As the crowd outside his home shouted `` ANC ,...",shot,train,the old man,5.0,3.0,3.0,...,1.0,2.0,5.0,3.0,5.0,the man,0,"(13, 16)","(16, 17)",active_full
9736,2454_7,shoot.06,obj,"As the crowd outside his home shouted `` ANC ,...",shot,train,his fists,1.0,5.0,5.0,...,5.0,5.0,1.0,1.0,1.0,the fists,1,"(17, 19)","(16, 17)",active_full
