In [1]:
import pandas as pd
import numpy as np
from io import StringIO
import transformers
import torch
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score
from scipy.stats import spearmanr


from sys import path
path.append("../")
from decompose.analysis import preprocess_input



In [2]:
# Read the combined V1/V2 table. na_filter=False keeps empty cells as empty
# strings instead of NaN; the "Unnamed: 0" column is dropped right after loading.
roles_wide = (
    pd.read_csv("../../dataV2/combined_V1_V2.csv", na_filter=False)
    .drop(columns="Unnamed: 0")
)

# Quick sanity check of the schema and size before showing the first rows.
print(roles_wide.columns, roles_wide.shape, sep="\n")
roles_wide.head()


Index(['Sentence.ID', 'Roleset', 'Gram.Func', 'Sentence', 'Predicate', 'Split',
       'Arg.Phrase', 'Arg.Stripped', 'Arg', 'arg_idx', 'verb_idx', 'structure',
       'decomp_version', 'Pred.Lemma', 'modified_sentence', 'Arg.Tokens.Begin',
       'combined_labels', 'VN_mappings'],
      dtype='object')
(24906, 18)


Unnamed: 0,Sentence.ID,Roleset,Gram.Func,Sentence,Predicate,Split,Arg.Phrase,Arg.Stripped,Arg,arg_idx,verb_idx,structure,decomp_version,Pred.Lemma,modified_sentence,Arg.Tokens.Begin,combined_labels,VN_mappings
0,0003_21,impose.01,nsubj,"In July , a gradual ban was imposed by the Env...",was imposed by,test,a gradual ban,a ban,1,"(3, 6)","(6, 9)",passive_full,V1,impose,1,,"[1.0, 1.0, 3.0, 1.0, 5.0, 1.0, 5.0, 1.0, 1.0, ...",
1,0003_21,impose.01,dobj,"In July , a gradual ban was imposed by the Env...",was imposed by,train,the Environmental Protection Agency,the Environmental Protection Agency,0,"(9, 13)","(6, 9)",passive_full,V1,impose,1,,"[5.0, 1.0, 3.0, 1.0, 5.0, 5.0, 5.0, 5.0, 1.0, ...",
2,0003_29,have.03,nsubj,No bearing on our work force today is had by it .,is had by,train,No bearing on our work force today,no bearing,1,"(0, 7)","(7, 10)",passive_full,V1,have,1,,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",
3,0003_29,have.03,dobj,No bearing on our work force today is had by it .,is had by,test,it,It,0,"(10, 11)","(7, 10)",passive_full,V1,have,1,,"[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 5.0, 3.0, 2.0, ...",
4,0003_9,lead.02,nsubj,A team of researchers from the National Cancer...,was led by,train,A team of researchers from the National Cancer...,a team,1,"(0, 19)","(19, 22)",passive_full,V1,lead,1,,"[5.0, 3.0, 3.0, 1.0, 3.0, 3.0, 5.0, 3.0, 1.0, ...",


In [3]:
# Give the two text columns the short names used by the rest of the notebook.
roles_wide = roles_wide.rename(columns={"Sentence": "sentence", "Arg.Phrase": "arg"})

# Property names, in the order used to size and index the label vectors below.
properties_list = [
    'awareness',
    'change_of_location',
    'change_of_state',
    'change_of_possession',
    'existed_after',
    'existed_before',
    'existed_during',
    'instigation',
    'sentient',
    'volition',
]


def make_labels_onehot(x):
    """Encode one rating per property as a flattened 3 x n_properties indicator.

    The three rows are, in order: rating <= 2 (negative), rating strictly
    between 2 and 4, and rating >= 4. The rows are concatenated, so the result
    has length ``3 * len(properties_list)`` and float dtype.
    """
    ratings = np.array(x)
    onehot = np.zeros((3, len(properties_list)))
    # One boolean row per class; the assignment casts True/False to 1.0/0.0.
    onehot[:] = np.vstack([
        ratings <= 2,
        np.abs(ratings - 3) < 1,
        ratings >= 4,
    ])
    return onehot.ravel()


def make_labels(x):
    """Map one rating per property to a class id: 0 (<= 2), 2 (>= 4), else 1."""
    ratings = np.array(x)
    # Start from the middle class and overwrite both ends.
    classes = np.full(len(properties_list), 1.0)
    classes[(ratings <= 2).nonzero()] = 0
    classes[(ratings >= 4).nonzero()] = 2
    return classes

# Parse the stringified rating lists read from the CSV into Python lists.
# The isinstance guard keeps this cell re-runnable: after the first run the
# column already holds lists, and eval() on a list raises TypeError.
# NOTE(review): eval() executes whatever text is stored in the CSV column.
# If the column only ever contains literal lists, ast.literal_eval would be
# the safer choice -- TODO confirm against the data before switching.
roles_wide["combined_labels"] = roles_wide["combined_labels"].apply(
    lambda x: eval(x) if isinstance(x, str) else x
)
# Derive both label encodings from the parsed ratings.
roles_wide['labels_onehot'] = roles_wide["combined_labels"].apply(make_labels_onehot)
roles_wide['labels'] = roles_wide["combined_labels"].apply(make_labels)

roles_wide.head()


Unnamed: 0,Sentence.ID,Roleset,Gram.Func,sentence,Predicate,Split,arg,Arg.Stripped,Arg,arg_idx,verb_idx,structure,decomp_version,Pred.Lemma,modified_sentence,Arg.Tokens.Begin,combined_labels,VN_mappings,labels_onehot,labels
0,0003_21,impose.01,nsubj,"In July , a gradual ban was imposed by the Env...",was imposed by,test,a gradual ban,a ban,1,"(3, 6)","(6, 9)",passive_full,V1,impose,1,,"[1.0, 1.0, 3.0, 1.0, 5.0, 1.0, 5.0, 1.0, 1.0, ...",,"[1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, ...","[0.0, 0.0, 1.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, ..."
1,0003_21,impose.01,dobj,"In July , a gradual ban was imposed by the Env...",was imposed by,train,the Environmental Protection Agency,the Environmental Protection Agency,0,"(9, 13)","(6, 9)",passive_full,V1,impose,1,,"[5.0, 1.0, 3.0, 1.0, 5.0, 5.0, 5.0, 5.0, 1.0, ...",,"[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[2.0, 0.0, 1.0, 0.0, 2.0, 2.0, 2.0, 2.0, 0.0, ..."
2,0003_29,have.03,nsubj,No bearing on our work force today is had by it .,is had by,train,No bearing on our work force today,no bearing,1,"(0, 7)","(7, 10)",passive_full,V1,have,1,,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,0003_29,have.03,dobj,No bearing on our work force today is had by it .,is had by,test,it,It,0,"(10, 11)","(7, 10)",passive_full,V1,have,1,,"[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 5.0, 3.0, 2.0, ...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 0.0, ..."
4,0003_9,lead.02,nsubj,A team of researchers from the National Cancer...,was led by,train,A team of researchers from the National Cancer...,a team,1,"(0, 19)","(19, 22)",passive_full,V1,lead,1,,"[5.0, 3.0, 3.0, 1.0, 3.0, 3.0, 5.0, 3.0, 1.0, ...",,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[2.0, 1.0, 1.0, 0.0, 1.0, 1.0, 2.0, 1.0, 0.0, ..."


### Inter-annotator agreement

In [4]:
# Columns identifying one (sentence, predicate, argument) item, and the label
# columns stacked across the rows that share those identifiers.
group_keys = ["Sentence.ID", "Pred.Lemma", "Gram.Func", "arg", "sentence", "Predicate",
              "arg_idx", "verb_idx", "structure", "modified_sentence"]
stacked_values = ["combined_labels", "labels"]

# Restrict to the V1 test split.
# Extra filter, currently disabled: (roles_wide["modified_sentence"] == 0)
v1_test_mask = roles_wide["decomp_version"].eq("V1") & roles_wide["Split"].eq("test")

per_annotator_df = (
    roles_wide.loc[v1_test_mask, group_keys + stacked_values]
    .pivot_table(index=group_keys, values=stacked_values, aggfunc=np.stack)
    .reset_index()
)

print(per_annotator_df.shape)


(1342, 12)


### Get model inference

In [5]:
def _load_classifier(model_dir):
    """Load the sequence-classification model stored in ``model_dir`` and the
    tokenizer stored in its ``tokenizer`` subfolder; return ``(model, tokenizer)``."""
    model = transformers.AutoModelForSequenceClassification.from_pretrained(model_dir)
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir + "/tokenizer")
    return model, tokenizer


roberta_path = "../combined_SPRL_models/roberta-large_dropout=0.1"
roberta_model, roberta_tokenizer = _load_classifier(roberta_path)

gpt_path = "../combined_SPRL_models/gpt2-medium"
gpt_model, gpt_tokenizer = _load_classifier(gpt_path)

bert_path = "../combined_SPRL_models/bert-large-cased"
bert_model, bert_tokenizer = _load_classifier(bert_path)


In [6]:
# Fields needed to build one model input per row, in the order the
# formatting loop unpacks them.
model_input_columns = ["sentence", "arg", "arg_idx", "Predicate", "verb_idx",
                       "structure", "modified_sentence"]
inputs = per_annotator_df[model_input_columns].to_numpy()
inputs[:5]


array([['In July , the Environmental Protection Agency imposed a gradual ban on virtually all uses of asbestos .',
        'a gradual ban', '(3, 7)', 'imposed', '(7, 8)', 'active_full', 0],
       ['In July , a gradual ban was imposed by the Environmental Protection Agency on virtually all uses of asbestos .',
        'a gradual ban', '(3, 6)', 'was imposed by', '(6, 9)',
        'passive_full', 1],
       ['No bearing on our work force today is had by it .', 'it',
        '(10, 11)', 'is had by', '(7, 10)', 'passive_full', 1],
       ['It has no bearing on our work force today .', 'It', '(0, 1)',
        'has', '(1, 2)', 'active_full', 0],
       ['Typically , comparable short-term investments were beaten by money-fund yields because portfolio managers can vary maturities and go after the highest rates .',
        'money-fund yields', '(8, 10)', 'were beaten by', '(5, 8)',
        'passive_full', 1]], dtype=object)

In [7]:
test = []
test_truncated = []

# Build two inputs per row: the fully marked sentence, and the same sentence
# cut off right after the last "<a>" marker.
# NOTE(review): eval() is applied to the span strings read from the data file;
# they look like "(3, 7)" tuples in the preview above -- confirm they are
# always plain literals.
for i, (sentence, arg, arg_idx, verb, verb_idx, structure, modified) in enumerate(inputs):
    try:
        marked, _ = preprocess_input.format_input(sentence, eval(arg_idx), eval(verb_idx))
        entry = {"sentence": marked,
                 "index": i,
                 "structure": structure.split("_")[0],
                 "modified": modified}
        test.append(entry)
        # Everything up to the last "<a>", with the marker itself re-attached.
        truncated = marked.rpartition("<a>")[0] + "<a>"
        test_truncated.append({**entry, "sentence": truncated})
    except ValueError as e:
        # Rows that raise ValueError are reported and left out of both lists.
        print(e)

print(len(test))


1342


In [8]:
test[:5]


[{'sentence': 'In July,<a> the Environmental Protection Agency<a><p> imposed<p> a gradual ban on virtually all uses of asbestos.',
  'index': 0,
  'structure': 'active',
  'modified': 0},
 {'sentence': 'In July,<a> a gradual ban<a><p> was imposed by<p> the Environmental Protection Agency on virtually all uses of asbestos.',
  'index': 1,
  'structure': 'passive',
  'modified': 1},
 {'sentence': 'No bearing on our work force today<p> is had by<p><a> it<a>.',
  'index': 2,
  'structure': 'passive',
  'modified': 1},
 {'sentence': '<a>It<a><p> has<p> no bearing on our work force today.',
  'index': 3,
  'structure': 'active',
  'modified': 0},
 {'sentence': 'Typically, comparable short-term investments<p> were beaten by<p><a> money-fund yields<a> because portfolio managers can vary maturities and go after the highest rates.',
  'index': 4,
  'structure': 'passive',
  'modified': 1}]

In [9]:
test_truncated[:5]


[{'sentence': 'In July,<a> the Environmental Protection Agency<a>',
  'index': 0,
  'structure': 'active',
  'modified': 0},
 {'sentence': 'In July,<a> a gradual ban<a>',
  'index': 1,
  'structure': 'passive',
  'modified': 1},
 {'sentence': 'No bearing on our work force today<p> is had by<p><a> it<a>',
  'index': 2,
  'structure': 'passive',
  'modified': 1},
 {'sentence': '<a>It<a>', 'index': 3, 'structure': 'active', 'modified': 0},
 {'sentence': 'Typically, comparable short-term investments<p> were beaten by<p><a> money-fund yields<a>',
  'index': 4,
  'structure': 'passive',
  'modified': 1}]

## Roberta

In [10]:
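# NOTE: RoBERTa inference is disabled here; the saved predictions are read back
# from "roberta_pred_V1.csv" in a later cell. Uncomment this cell and the two
# cells after it to regenerate that file.
# roberta_pred = {}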

# with torch.no_grad():
#     for t in test:
#         i = t["index"]
#         tokens = roberta_tokenizer(t["sentence"],
#                         padding="max_length", truncation=True,
#                         max_length=256, return_tensors="pt")
        
#         logits = roberta_model(**tokens).logits
#         logits = torch.Tensor(logits.reshape(logits.shape[0], 3, len(properties_list)))

#         # get probabilities using softmax
#         probs = torch.softmax(logits, axis=1)
#         y_pred = np.argmax(probs, axis=1)
#         roberta_pred[i] = y_pred
#         if i % 10 == 0:
#             print(i)


In [11]:
# annotator_model_df = per_annotator_df.iloc[list(roberta_pred.keys())]
# annotator_model_df["model_pred"] = torch.stack(list(roberta_pred.values())).numpy().squeeze().tolist()
# annotator_model_df.head()


In [12]:
# annotator_model_df.to_csv("roberta_pred_V1.csv")


In [13]:
# Load the cached RoBERTa predictions and turn the two stringified columns
# back into numeric containers.
roberta_raw = pd.read_csv("roberta_pred_V1.csv")

annotator_model_df = roberta_raw.assign(
    # "labels" was saved as the printed form of a NumPy array: strip the
    # brackets and let loadtxt parse the whitespace-separated numbers.
    labels=roberta_raw["labels"].apply(
        lambda text: np.loadtxt(StringIO(text.translate(str.maketrans("", "", "[]"))))
    ),
    # NOTE(review): eval() runs on text read from the CSV; fine for a file this
    # notebook wrote itself, unsafe for files from elsewhere.
    model_pred=roberta_raw["model_pred"].apply(eval),
)


### Active and Passive

In [14]:
from sklearn.metrics import f1_score, precision_score, recall_score

def evaluate_active_passive(annotator_model_df):
    """Score model predictions against gold labels for every property.

    Parameters
    ----------
    annotator_model_df : pandas.DataFrame
        Needs a ``labels`` and a ``model_pred`` column, each holding one
        class-id vector per row with one entry per name in ``properties_list``.

    Returns
    -------
    list of dict
        One dict per property with keys ``property``, ``F1_micro``,
        ``F1_binary``, ``precision`` and ``recall``, followed by one
        ``"overall"`` dict with ``F1_micro`` and ``F1_macro``.

    Notes
    -----
    Both columns are binarised with ``> 1``, so only class id 2 counts as
    positive. The per-property ``F1_micro`` micro-averages over both classes
    of a binary problem, which coincides with accuracy. The overall scores
    treat the data as a multilabel problem (rows = examples, columns =
    properties): ``F1_micro`` pools the positive-class counts of all
    properties, and ``F1_macro`` is the mean of the per-property F1 scores.
    """
    # Shape (n_examples, n_properties), the orientation sklearn expects for
    # multilabel indicator matrices.
    gold = np.stack(annotator_model_df["labels"].to_list()) > 1
    pred = np.stack(annotator_model_df["model_pred"].to_list()) > 1

    scores = []
    for col, prop_name in enumerate(properties_list):
        y = gold[:, col]
        y_pred = pred[:, col]
        f1_micro = f1_score(y, y_pred, labels=[0, 1], average='micro')
        f1_binary = f1_score(y, y_pred, labels=[0, 1], average='binary')
        precision = precision_score(y, y_pred, labels=[0, 1], average='binary')
        recall = recall_score(y, y_pred, labels=[0, 1], average='binary')

        scores.append({
            "property": prop_name,
            "F1_micro": f1_micro,
            "F1_binary": f1_binary,
            "precision": precision,
            "recall": recall
        })

    # No `labels=` here: for multilabel input sklearn interprets `labels` as
    # column indices, so `labels=[0, 1]` on the transposed matrix used to
    # restrict the overall score to the first two examples only.
    f1_micro = f1_score(gold, pred, average='micro')
    f1_macro = f1_score(gold, pred, average='macro')

    scores.append({
        "property": "overall",
        "F1_micro": f1_micro,
        "F1_macro": f1_macro,
    })

    return scores

scores = evaluate_active_passive(annotator_model_df)
pd.DataFrame(scores)


Unnamed: 0,property,F1_micro,F1_binary,precision,recall,F1_macro
0,awareness,0.918033,0.897959,0.873646,0.923664,
1,change_of_location,0.940387,0.529412,0.54878,0.511364,
2,change_of_state,0.783159,0.680571,0.70615,0.65678,
3,change_of_possession,0.960507,0.644295,0.676056,0.615385,
4,existed_after,0.823398,0.880363,0.847425,0.915966,
5,existed_before,0.83383,0.87618,0.854821,0.898633,
6,existed_during,0.90313,0.945286,0.925041,0.966437,
7,instigation,0.895678,0.862475,0.83619,0.890467,
8,sentient,0.932936,0.867257,0.832861,0.904615,
9,volition,0.915797,0.884103,0.860279,0.909283,


### Active Only

In [15]:
def evaluate_active_only(annotator_model_df):
    """Score only the rows whose ``structure`` column equals ``"active_full"``.

    The metric computation is shared with ``evaluate_active_passive`` instead
    of being duplicated here; the return value has the same structure (one
    dict per property followed by an ``"overall"`` dict).
    """
    active_rows = annotator_model_df[annotator_model_df["structure"] == "active_full"]
    return evaluate_active_passive(active_rows)

scores = evaluate_active_only(annotator_model_df)
pd.DataFrame(scores)


Unnamed: 0,property,F1_micro,F1_binary,precision,recall,F1_macro
0,awareness,0.904663,0.880519,0.860406,0.901596,
1,change_of_location,0.936788,0.534351,0.530303,0.538462,
2,change_of_state,0.778238,0.674772,0.687307,0.662687,
3,change_of_possession,0.956477,0.618182,0.653846,0.586207,
4,existed_after,0.812435,0.870991,0.838134,0.906528,
5,existed_before,0.825907,0.86875,0.852761,0.88535,
6,existed_during,0.888083,0.935407,0.915691,0.95599,
7,instigation,0.883938,0.844444,0.826087,0.863636,
8,sentient,0.919171,0.841463,0.808594,0.877119,
9,volition,0.902591,0.86533,0.845938,0.88563,


### Passive Only

In [16]:
def evaluate_passive_only(annotator_model_df):
    """Score only the rows whose ``structure`` column equals ``"passive_full"``.

    The metric computation is shared with ``evaluate_active_passive`` instead
    of being duplicated here; the return value has the same structure (one
    dict per property followed by an ``"overall"`` dict).
    """
    passive_rows = annotator_model_df[annotator_model_df["structure"] == "passive_full"]
    return evaluate_active_passive(passive_rows)

scores = evaluate_passive_only(annotator_model_df)
pd.DataFrame(scores)


Unnamed: 0,property,F1_micro,F1_binary,precision,recall,F1_macro
0,awareness,0.952255,0.941558,0.90625,0.97973,
1,change_of_location,0.949602,0.512821,0.625,0.434783,
2,change_of_state,0.795756,0.695652,0.758621,0.642336,
3,change_of_possession,0.970822,0.717949,0.736842,0.7,
4,existed_after,0.851459,0.903114,0.87,0.938849,
5,existed_before,0.854111,0.894434,0.859779,0.932,
6,existed_during,0.941645,0.96875,0.947222,0.991279,
7,instigation,0.925729,0.90604,0.859873,0.957447,
8,sentient,0.96817,0.935484,0.896907,0.977528,
9,volition,0.949602,0.931408,0.895833,0.969925,


## GPT-2

In [17]:
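# NOTE: GPT-2 inference is disabled here; the saved predictions are read back
# from "gpt_pred_V1.csv" in a later cell. Uncomment this cell and the next one
# to regenerate that file.
# gpt_pred = {}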

# with torch.no_grad():
#     for t in test:
#         i = t["index"]
#         tokens = gpt_tokenizer(t["sentence"],
#                         padding="max_length", truncation=True,
#                         max_length=256, return_tensors="pt")
        
#         logits = gpt_model(**tokens).logits
#         logits = torch.Tensor(logits.reshape(logits.shape[0], 3, len(properties_list)))

#         # get probabilities using softmax
#         probs = torch.softmax(logits, axis=1)
#         y_pred = np.argmax(probs, axis=1)
#         gpt_pred[i] = y_pred
#         if i % 10 == 0:
#             print(i)


In [18]:
# annotator_model_df = per_annotator_df.iloc[list(gpt_pred.keys())]
# annotator_model_df["model_pred"] = torch.stack(list(gpt_pred.values())).numpy().squeeze().tolist()
# annotator_model_df.to_csv("gpt_pred_V1.csv")


In [19]:
# Load the cached GPT-2 predictions and turn the two stringified columns back
# into numeric containers. This rebinds annotator_model_df, so the scoring
# cells below evaluate GPT-2.
gpt_raw = pd.read_csv("gpt_pred_V1.csv")

annotator_model_df = gpt_raw.assign(
    # "labels" was saved as the printed form of a NumPy array: strip the
    # brackets and let loadtxt parse the whitespace-separated numbers.
    labels=gpt_raw["labels"].apply(
        lambda text: np.loadtxt(StringIO(text.translate(str.maketrans("", "", "[]"))))
    ),
    # NOTE(review): eval() runs on text read from the CSV; fine for a file this
    # notebook wrote itself, unsafe for files from elsewhere.
    model_pred=gpt_raw["model_pred"].apply(eval),
)


### Active and Passive

In [20]:
# Score every row of annotator_model_df (active and passive together). When the
# notebook is run in order, this frame holds the predictions read from
# "gpt_pred_V1.csv".
scores = evaluate_active_passive(annotator_model_df)
pd.DataFrame(scores)


Unnamed: 0,property,F1_micro,F1_binary,precision,recall,F1_macro
0,awareness,0.916542,0.895131,0.878676,0.912214,
1,change_of_location,0.944113,0.545455,0.584416,0.511364,
2,change_of_state,0.783159,0.679162,0.708046,0.652542,
3,change_of_possession,0.960507,0.644295,0.676056,0.615385,
4,existed_after,0.818927,0.874936,0.857719,0.892857,
5,existed_before,0.817437,0.86166,0.854423,0.869021,
6,existed_during,0.909091,0.948129,0.936975,0.959552,
7,instigation,0.891207,0.854582,0.83953,0.870183,
8,sentient,0.936662,0.871017,0.859281,0.883077,
9,volition,0.913562,0.879167,0.868313,0.890295,


### Active Only

In [21]:
# Same scoring, restricted to rows whose structure is "active_full". When the
# notebook is run in order, annotator_model_df holds the predictions read from
# "gpt_pred_V1.csv".
scores = evaluate_active_only(annotator_model_df)
pd.DataFrame(scores)


Unnamed: 0,property,F1_micro,F1_binary,precision,recall,F1_macro
0,awareness,0.902591,0.877604,0.859694,0.896277,
1,change_of_location,0.939896,0.532258,0.559322,0.507692,
2,change_of_state,0.781347,0.669797,0.703947,0.638806,
3,change_of_possession,0.956477,0.618182,0.653846,0.586207,
4,existed_after,0.813472,0.868613,0.854885,0.882789,
5,existed_before,0.81658,0.858964,0.859649,0.85828,
6,existed_during,0.897409,0.940109,0.930539,0.949878,
7,instigation,0.871503,0.828255,0.808108,0.849432,
8,sentient,0.923316,0.845188,0.834711,0.855932,
9,volition,0.900518,0.861671,0.847025,0.876833,


### Passive Only

In [22]:
# Same scoring, restricted to rows whose structure is "passive_full". When the
# notebook is run in order, annotator_model_df holds the predictions read from
# "gpt_pred_V1.csv".
scores = evaluate_passive_only(annotator_model_df)
pd.DataFrame(scores)


Unnamed: 0,property,F1_micro,F1_binary,precision,recall,F1_macro
0,awareness,0.952255,0.94,0.927632,0.952703,
1,change_of_location,0.954907,0.585366,0.666667,0.521739,
2,change_of_state,0.787798,0.701493,0.717557,0.686131,
3,change_of_possession,0.970822,0.717949,0.736842,0.7,
4,existed_after,0.832891,0.890052,0.864407,0.917266,
5,existed_before,0.819629,0.868217,0.842105,0.896,
6,existed_during,0.938992,0.967096,0.952113,0.982558,
7,instigation,0.941645,0.921986,0.921986,0.921986,
8,sentient,0.970822,0.939227,0.923913,0.955056,
9,volition,0.94695,0.924812,0.924812,0.924812,
