In [1]:
import pandas as pd
import numpy as np
from io import StringIO
import transformers
import torch
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score
from scipy.stats import spearmanr


from sys import path
path.append("../")
from decompose.analysis import preprocess_input



In [2]:
roles_wide = pd.read_csv("../../dataV2/combined_V1_V2.csv", na_filter=False)
roles_wide.drop("Unnamed: 0", inplace=True, axis=1)

print(roles_wide.columns)
print(roles_wide.shape)
roles_wide.head()


Index(['Sentence.ID', 'Roleset', 'Gram.Func', 'Sentence', 'Predicate', 'Split',
       'Arg.Phrase', 'Arg.Stripped', 'Arg', 'arg_idx', 'verb_idx', 'structure',
       'decomp_version', 'Pred.Lemma', 'modified_sentence', 'combined_labels',
       'VN_mappings'],
      dtype='object')
(28816, 17)


Unnamed: 0,Sentence.ID,Roleset,Gram.Func,Sentence,Predicate,Split,Arg.Phrase,Arg.Stripped,Arg,arg_idx,verb_idx,structure,decomp_version,Pred.Lemma,modified_sentence,combined_labels,VN_mappings
0,0003_21,impose.01,nsubj,"In July , a gradual ban was imposed by the Env...",was imposed by,test,a gradual ban,a ban,1,"(3, 6)","(6, 9)",passive_full,V1,impose,1,"[1.0, 1.0, 3.0, 1.0, 5.0, 1.0, 5.0, 1.0, 1.0, ...",
1,0003_21,impose.01,dobj,"In July , a gradual ban was imposed by the Env...",was imposed by,train,the Environmental Protection Agency,the Environmental Protection Agency,0,"(9, 13)","(6, 9)",passive_full,V1,impose,1,"[5.0, 1.0, 3.0, 1.0, 5.0, 5.0, 5.0, 5.0, 1.0, ...",
2,0003_29,have.03,nsubj,No bearing on our work force today is had by it .,is had by,train,No bearing on our work force today,no bearing,1,"(0, 7)","(7, 10)",passive_full,V1,have,1,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",
3,0003_29,have.03,dobj,No bearing on our work force today is had by it .,is had by,test,it,It,0,"(10, 11)","(7, 10)",passive_full,V1,have,1,"[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 5.0, 3.0, 2.0, ...",
4,0003_9,lead.02,nsubj,A team of researchers from the National Cancer...,was led by,train,A team of researchers from the National Cancer...,a team,1,"(0, 19)","(19, 22)",passive_full,V1,lead,1,"[5.0, 3.0, 3.0, 1.0, 3.0, 3.0, 5.0, 3.0, 1.0, ...",


In [3]:
roles_wide.rename({"Sentence": "sentence", "Arg.Phrase": "arg"}, axis=1, inplace=True)

properties_list = ['awareness', 'change_of_location', 'change_of_state',
       'change_of_possession', 'existed_after', 'existed_before',
       'existed_during', 'instigation', 'sentient', 'volition']

def make_labels_onehot(x):
    x = np.array(x)
    labels = np.zeros((3, len(properties_list)))
    # 1 or 2 (negative)
    labels[0, :] = (x <= 2)
    labels[1, :] = np.abs(x - 3) < 1
    labels[2, :] = (x >= 4)
    # print(labels.flatten().reshape(3, -1))
    return labels.flatten()

def make_labels(x):
    x = np.array(x)
    labels = np.ones(len(properties_list))
    labels[np.where(x <= 2)] = 0
    labels[np.where(x >= 4)] = 2
    return labels

roles_wide["combined_labels"] = roles_wide["combined_labels"].apply(lambda x: eval(x))
roles_wide['labels_onehot'] = roles_wide["combined_labels"].apply(make_labels_onehot)
roles_wide['labels'] = roles_wide["combined_labels"].apply(make_labels)

roles_wide.head()


Unnamed: 0,Sentence.ID,Roleset,Gram.Func,sentence,Predicate,Split,arg,Arg.Stripped,Arg,arg_idx,verb_idx,structure,decomp_version,Pred.Lemma,modified_sentence,combined_labels,VN_mappings,labels_onehot,labels
0,0003_21,impose.01,nsubj,"In July , a gradual ban was imposed by the Env...",was imposed by,test,a gradual ban,a ban,1,"(3, 6)","(6, 9)",passive_full,V1,impose,1,"[1.0, 1.0, 3.0, 1.0, 5.0, 1.0, 5.0, 1.0, 1.0, ...",,"[1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, ...","[0.0, 0.0, 1.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, ..."
1,0003_21,impose.01,dobj,"In July , a gradual ban was imposed by the Env...",was imposed by,train,the Environmental Protection Agency,the Environmental Protection Agency,0,"(9, 13)","(6, 9)",passive_full,V1,impose,1,"[5.0, 1.0, 3.0, 1.0, 5.0, 5.0, 5.0, 5.0, 1.0, ...",,"[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[2.0, 0.0, 1.0, 0.0, 2.0, 2.0, 2.0, 2.0, 0.0, ..."
2,0003_29,have.03,nsubj,No bearing on our work force today is had by it .,is had by,train,No bearing on our work force today,no bearing,1,"(0, 7)","(7, 10)",passive_full,V1,have,1,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,0003_29,have.03,dobj,No bearing on our work force today is had by it .,is had by,test,it,It,0,"(10, 11)","(7, 10)",passive_full,V1,have,1,"[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 5.0, 3.0, 2.0, ...",,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 0.0, ..."
4,0003_9,lead.02,nsubj,A team of researchers from the National Cancer...,was led by,train,A team of researchers from the National Cancer...,a team,1,"(0, 19)","(19, 22)",passive_full,V1,lead,1,"[5.0, 3.0, 3.0, 1.0, 3.0, 3.0, 5.0, 3.0, 1.0, ...",,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[2.0, 1.0, 1.0, 0.0, 1.0, 1.0, 2.0, 1.0, 0.0, ..."


### Inter-annotator agreement

In [4]:
per_annotator_df = roles_wide.loc[
    (roles_wide["decomp_version"] == "V2") & \
    (roles_wide["Split"] == "test")
    # (roles_wide["modified_sentence"] == 0)
    ][["Sentence.ID", "Pred.Lemma", "Gram.Func", "arg", "combined_labels", "labels", "sentence", "Predicate", "arg_idx", "verb_idx", "structure", "modified_sentence"]].pivot_table(
    index=["Sentence.ID", "Pred.Lemma", "Gram.Func", "arg", "sentence", "Predicate", "arg_idx", "verb_idx",  "structure", "modified_sentence"], values=["combined_labels", "labels"], aggfunc=np.stack).reset_index()

print(per_annotator_df.shape)
per_annotator_df["labels"] = per_annotator_df["labels"].apply(
    lambda x: x[:2] if x.shape[0] >= 2 else np.nan
)
per_annotator_df["combined_labels"] = per_annotator_df["combined_labels"].apply(
    lambda x: x[:2] if x.shape[0] >= 2 else np.nan
)
per_annotator_df.dropna(subset=["labels"], inplace=True)
print(per_annotator_df.shape)


(640, 12)
(632, 12)


In [5]:
print(per_annotator_df.shape)
per_annotator = np.stack(per_annotator_df["labels"].to_list())
print(per_annotator.shape)

scores = []
for i, property in enumerate(properties_list):
    y1 = per_annotator[:, 0, i].flatten()
    y2 = per_annotator[:, 1, i].flatten()
    kappa = cohen_kappa_score(y1, y2, labels=[0, 1, 2])
    f1_1 = f1_score(y1, y2, labels=[0, 1, 2], average='micro')
    f1_2 = f1_score(y2, y1, labels=[0, 1, 2], average='micro')
    assert f1_1 == f1_2
    rho = spearmanr(y1, y2)[0]
    scores.append({
        "property": property, "Cohen Kappa": kappa, "F1": f1_1, "Rho": rho,
    })

pd.DataFrame(scores)


(632, 12)
(632, 2, 10)


Unnamed: 0,property,Cohen Kappa,F1,Rho
0,awareness,0.746565,0.868671,0.789665
1,change_of_location,0.191985,0.496835,0.261405
2,change_of_state,0.139438,0.450949,0.176156
3,change_of_possession,0.297747,0.762658,0.31512
4,existed_after,0.234712,0.848101,0.292082
5,existed_before,0.379917,0.833861,0.442872
6,existed_during,0.220789,0.958861,0.305609
7,instigation,0.301169,0.685127,0.326946
8,sentient,0.756589,0.871835,0.811543
9,volition,0.69846,0.84019,0.761569


In [6]:
print(per_annotator_df.shape)
per_annotator = np.stack(per_annotator_df["combined_labels"].to_list())
print(per_annotator.shape)

scores = []
for i, property in enumerate(properties_list):
    y1 = per_annotator[:, 0, i].flatten().astype(int)
    y2 = per_annotator[:, 1, i].flatten().astype(int)
    kappa = cohen_kappa_score(y1, y2, labels=np.arange(5))
    f1_1 = f1_score(y1, y2, labels=np.arange(5), average='micro')
    f1_2 = f1_score(y2, y1, labels=np.arange(5), average='micro')
    assert f1_1 == f1_2
    rho = spearmanr(y1, y2)[0]
    scores.append({
        "property": property,
        "Cohen Kappa": kappa, "F1": f1_1,
        "Rho": rho
    })

pd.DataFrame(scores)


(632, 12)
(632, 2, 10)


Unnamed: 0,property,Cohen Kappa,F1,Rho
0,awareness,0.261489,0.696921,0.790613
1,change_of_location,0.101355,0.351211,0.314649
2,change_of_state,0.0905,0.296558,0.178861
3,change_of_possession,0.145529,0.529943,0.250148
4,existed_after,0.13754,0.123967,0.252844
5,existed_before,0.139277,0.212454,0.471364
6,existed_during,0.333333,0.117647,0.234572
7,instigation,0.149777,0.252765,0.309721
8,sentient,0.242097,0.753846,0.809756
9,volition,0.257652,0.615385,0.766716


In [7]:
a = np.stack(per_annotator_df["combined_labels"].to_list())
spearmanr(a[:, 0, :].flatten(), a[:, 1, :].flatten())


SignificanceResult(statistic=0.7136104749049405, pvalue=0.0)

### Get model inference

In [8]:
roberta_path = "../combined_SPRL_models/roberta-large_dropout=0.1"
roberta_model = transformers.AutoModelForSequenceClassification.from_pretrained(roberta_path)
roberta_tokenizer = transformers.AutoTokenizer.from_pretrained(roberta_path + "/tokenizer")

gpt_path = "../combined_SPRL_models/gpt2-medium"
gpt_model = transformers.AutoModelForSequenceClassification.from_pretrained(gpt_path)
gpt_tokenizer = transformers.AutoTokenizer.from_pretrained(gpt_path + "/tokenizer")

bert_path = "../combined_SPRL_models/bert-large-cased"
bert_model = transformers.AutoModelForSequenceClassification.from_pretrained(bert_path)
bert_tokenizer = transformers.AutoTokenizer.from_pretrained(bert_path + "/tokenizer")


In [9]:
inputs = per_annotator_df.reset_index()[["sentence", "arg", "arg_idx", "Predicate", "verb_idx", "structure", "modified_sentence"]].to_numpy()
inputs[:5]


array([['Her hand is raised by a girl .', 'a girl', '(5, 7)',
        'is raised by', '(2, 5)', 'passive_full', 1],
       ['A girl raises her hand .', 'her hand', '(3, 5)', 'raises',
        '(2, 3)', 'active_full', 0],
       ['A girl raises her hand .', 'A girl', '(0, 2)', 'raises',
        '(2, 3)', 'active_full', 0],
       ['Her hand is raised by a girl .', 'Her hand', '(0, 2)',
        'is raised by', '(2, 5)', 'passive_full', 1],
       ['" I \'m afraid not , " explains Winston , " that is what we would call a GREAT LOSS . "',
        'that', '(11, 12)', 'is', '(12, 13)', 'active_full', 0]],
      dtype=object)

In [10]:
test = []
test_truncated = []

for i, (sentence, arg, arg_idx, verb, verb_idx, structure, modified) in enumerate(inputs):
    try:
        arg_idx = eval(arg_idx)
        verb_idx = eval(verb_idx)
        sentence, _ = preprocess_input.format_input(sentence, arg_idx, verb_idx)
        test.append({"sentence": sentence,
                     "index": i, 
                     "structure": structure.split("_")[0],
                     "modified": modified})
        sentence_truncated = "<a>".join(sentence.split("<a>")[:-1]) + "<a>"
        test_truncated.append({"sentence": sentence_truncated,
                     "index": i, 
                     "structure": structure.split("_")[0],
                     "modified": modified})
    except ValueError as e:
        print(e)

print(len(test))


632


In [11]:
test[:5]


[{'sentence': 'Her hand<p> is raised by<p><a> a girl<a>.',
  'index': 0,
  'structure': 'passive',
  'modified': 1},
 {'sentence': 'A girl<p> raises<p><a> her hand<a>.',
  'index': 1,
  'structure': 'active',
  'modified': 0},
 {'sentence': '<a>A girl<a><p> raises<p> her hand.',
  'index': 2,
  'structure': 'active',
  'modified': 0},
 {'sentence': '<a>Her hand<a><p> is raised by<p> a girl.',
  'index': 3,
  'structure': 'passive',
  'modified': 1},
 {'sentence': '"I \'m afraid not", explains Winston, "<a> that<a><p> is<p> what we would call a GREAT LOSS".',
  'index': 4,
  'structure': 'active',
  'modified': 0}]

In [12]:
test_truncated[:5]


[{'sentence': 'Her hand<p> is raised by<p><a> a girl<a>',
  'index': 0,
  'structure': 'passive',
  'modified': 1},
 {'sentence': 'A girl<p> raises<p><a> her hand<a>',
  'index': 1,
  'structure': 'active',
  'modified': 0},
 {'sentence': '<a>A girl<a>',
  'index': 2,
  'structure': 'active',
  'modified': 0},
 {'sentence': '<a>Her hand<a>',
  'index': 3,
  'structure': 'passive',
  'modified': 1},
 {'sentence': '"I \'m afraid not", explains Winston, "<a> that<a>',
  'index': 4,
  'structure': 'active',
  'modified': 0}]

## Roberta

In [13]:
roberta_pred = {}

with torch.no_grad():
    for t in test:
        i = t["index"]
        tokens = roberta_tokenizer(t["sentence"],
                        padding="max_length", truncation=True,
                        max_length=256, return_tensors="pt")
        
        logits = roberta_model(**tokens).logits
        logits = torch.Tensor(logits.reshape(logits.shape[0], 3, len(properties_list)))

        # get probabilities using softmax
        probs = torch.softmax(logits, axis=1)
        y_pred = np.argmax(probs, axis=1)
        roberta_pred[i] = y_pred
        if i % 10 == 0:
            print(i)


0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
590
600
610
620
630


In [14]:
annotator_model_df = per_annotator_df.iloc[list(roberta_pred.keys())]
annotator_model_df["model_pred"] = torch.stack(list(roberta_pred.values())).numpy().squeeze().tolist()
annotator_model_df.head()


Unnamed: 0,Sentence.ID,Pred.Lemma,Gram.Func,arg,sentence,Predicate,arg_idx,verb_idx,structure,modified_sentence,combined_labels,labels,model_pred
0,en-ud-test.conllu 1002,raise,dobj,a girl,Her hand is raised by a girl .,is raised by,"(5, 7)","(2, 5)",passive_full,1,"[[5.0, 2.0, 3.0, 3.0, 5.0, 5.0, 5.0, 2.0, 5.0,...","[[2.0, 0.0, 1.0, 1.0, 2.0, 2.0, 2.0, 0.0, 2.0,...","[2, 1, 1, 0, 2, 2, 2, 2, 2, 2]"
1,en-ud-test.conllu 1002,raise,dobj,her hand,A girl raises her hand .,raises,"(3, 5)","(2, 3)",active_full,0,"[[1.0, 5.0, 1.0, 1.0, 5.0, 5.0, 5.0, 1.0, 1.0,...","[[0.0, 2.0, 0.0, 0.0, 2.0, 2.0, 2.0, 0.0, 0.0,...","[0, 2, 2, 0, 2, 2, 2, 0, 0, 0]"
2,en-ud-test.conllu 1002,raise,nsubj,A girl,A girl raises her hand .,raises,"(0, 2)","(2, 3)",active_full,0,"[[5.0, 2.0, 3.0, 3.0, 5.0, 5.0, 5.0, 2.0, 5.0,...","[[2.0, 0.0, 1.0, 1.0, 2.0, 2.0, 2.0, 0.0, 2.0,...","[2, 1, 1, 0, 2, 2, 2, 2, 2, 2]"
3,en-ud-test.conllu 1002,raise,nsubj,Her hand,Her hand is raised by a girl .,is raised by,"(0, 2)","(2, 5)",passive_full,1,"[[1.0, 5.0, 1.0, 1.0, 5.0, 5.0, 5.0, 1.0, 1.0,...","[[0.0, 2.0, 0.0, 0.0, 2.0, 2.0, 2.0, 0.0, 0.0,...","[0, 2, 2, 0, 2, 2, 2, 0, 0, 0]"
4,en-ud-test.conllu 1004,be,nsubj,that,""" I 'm afraid not , "" explains Winston , "" tha...",is,"(11, 12)","(12, 13)",active_full,0,"[[3.0, 3.0, 3.0, 3.0, 3.0, 1.0, 5.0, 3.0, 1.0,...","[[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 2.0, 1.0, 0.0,...","[0, 0, 0, 0, 2, 2, 2, 2, 0, 0]"


In [15]:
annotator_model_df.to_csv("roberta_pred.csv")


In [16]:
annotator_model_df = pd.read_csv("roberta_pred.csv")

annotator_model_df["labels"] = annotator_model_df["labels"].apply(
    lambda x: np.loadtxt(StringIO(x.replace("[", "").replace("]", "")))
)
annotator_model_df["model_pred"] = annotator_model_df["model_pred"].apply(eval)


### Active and Passive

In [17]:
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score

def evaluate_active_passive(annotator_model_df):
    per_annotator = np.stack(annotator_model_df["labels"].to_list()).transpose((2, 1, 0))
    model_preds = np.stack(annotator_model_df["model_pred"].to_list()).T

    scores = []
    for i, property in enumerate(properties_list):
        y1 = per_annotator[i, 0] < 2
        y2 = per_annotator[i, 1] < 2
        y_pred = model_preds[i] < 2

        kappa_human = cohen_kappa_score(y1, y2, labels=[0, 1])
        f1_human = f1_score(y1, y2, labels=[0, 1], average='micro')

        kappa_model_1 = cohen_kappa_score(y1, y_pred, labels=[0, 1])
        kappa_model_2 = cohen_kappa_score(y2, y_pred, labels=[0, 1])
        f1_model_1 = f1_score(y1, y_pred, labels=[0, 1], average='micro')
        f1_model_2 = f1_score(y2, y_pred, labels=[0, 1], average='micro')

        assert f1_1 == f1_2
        scores.append({
            "property": property, "Kappa_humans": kappa_human, "F1_humans": f1_human,
            "Kappa_model_1": kappa_model_1, "Kappa_model_2": kappa_model_2, 
            "Kappa_model_avg": np.mean([kappa_model_1, kappa_model_2]),
            "F1_model_1": f1_model_1, "F1_model_2": f1_model_2,
            "F1_model_avg": np.mean([f1_model_1, f1_model_2]),
        })

    return scores

scores = evaluate_active_passive(annotator_model_df)
pd.DataFrame(scores)


Unnamed: 0,property,Kappa_humans,F1_humans,Kappa_model_1,Kappa_model_2,Kappa_model_avg,F1_model_1,F1_model_2,F1_model_avg
0,awareness,0.785152,0.893987,0.834488,0.799148,0.816818,0.917722,0.900316,0.909019
1,change_of_location,0.284605,0.778481,0.273465,0.290256,0.28186,0.78481,0.794304,0.789557
2,change_of_state,0.159889,0.662975,0.183825,0.213714,0.19877,0.65981,0.670886,0.665348
3,change_of_possession,0.333503,0.908228,0.287285,0.413457,0.350371,0.911392,0.93038,0.920886
4,existed_after,0.278614,0.860759,0.23067,0.334225,0.282448,0.856013,0.900316,0.878165
5,existed_before,0.431633,0.852848,0.520836,0.467461,0.494148,0.871835,0.863924,0.86788
6,existed_during,0.277989,0.962025,0.0,0.0,0.0,0.962025,0.984177,0.973101
7,instigation,0.320917,0.710443,0.315846,0.327072,0.321459,0.726266,0.753165,0.739715
8,sentient,0.781606,0.890823,0.835455,0.819848,0.827651,0.917722,0.90981,0.913766
9,volition,0.746863,0.873418,0.797491,0.734175,0.765833,0.898734,0.867089,0.882911


### Active Only

In [18]:
def evaluate_active_only(annotator_model_df):
    per_annotator = np.stack(annotator_model_df.loc[annotator_model_df["structure"] == "active_full"]["labels"].to_list()).transpose((2, 1, 0))
    model_preds = np.stack(annotator_model_df.loc[annotator_model_df["structure"] == "active_full"]["model_pred"].to_list()).T

    scores = []
    for i, property in enumerate(properties_list):
        y1 = per_annotator[i, 0] < 2
        y2 = per_annotator[i, 1] < 2
        y_pred = model_preds[i] < 2

        kappa_human = cohen_kappa_score(y1, y2, labels=[0, 1])
        f1_human = f1_score(y1, y2, labels=[0, 1], average='micro')

        kappa_model_1 = cohen_kappa_score(y1, y_pred, labels=[0, 1])
        kappa_model_2 = cohen_kappa_score(y2, y_pred, labels=[0, 1])
        f1_model_1 = f1_score(y1, y_pred, labels=[0, 1], average='micro')
        f1_model_2 = f1_score(y2, y_pred, labels=[0, 1], average='micro')

        assert f1_1 == f1_2
        scores.append({
            "property": property, "Kappa_humans": kappa_human, "F1_humans": f1_human,
            "Kappa_model_1": kappa_model_1, "Kappa_model_2": kappa_model_2, 
            "Kappa_model_avg": np.mean([kappa_model_1, kappa_model_2]),
            "F1_model_1": f1_model_1, "F1_model_2": f1_model_2,
            "F1_model_avg": np.mean([f1_model_1, f1_model_2]),
        })

    return scores

scores = evaluate_active_only(annotator_model_df)
pd.DataFrame(scores)


Unnamed: 0,property,Kappa_humans,F1_humans,Kappa_model_1,Kappa_model_2,Kappa_model_avg,F1_model_1,F1_model_2,F1_model_avg
0,awareness,0.788189,0.897368,0.849662,0.811465,0.830564,0.926316,0.907895,0.917105
1,change_of_location,0.302569,0.778947,0.284557,0.312345,0.298451,0.792105,0.807895,0.8
2,change_of_state,0.127477,0.65,0.220048,0.208175,0.214112,0.673684,0.671053,0.672368
3,change_of_possession,0.306569,0.913158,0.269593,0.395866,0.33273,0.918421,0.936842,0.927632
4,existed_after,0.259075,0.855263,0.194503,0.377656,0.28608,0.847368,0.902632,0.875
5,existed_before,0.416361,0.85,0.505234,0.470028,0.487631,0.868421,0.865789,0.867105
6,existed_during,0.267729,0.960526,0.0,0.0,0.0,0.963158,0.981579,0.972368
7,instigation,0.321521,0.723684,0.313203,0.342375,0.327789,0.739474,0.773684,0.756579
8,sentient,0.78721,0.894737,0.846757,0.83586,0.841309,0.923684,0.918421,0.921053
9,volition,0.735147,0.868421,0.798476,0.725227,0.761852,0.9,0.863158,0.881579


### Passive Only

In [19]:
def evaluate_passive_only(annotator_model_df):
    per_annotator = np.stack(annotator_model_df.loc[annotator_model_df["structure"] == "passive_full"]["labels"].to_list()).transpose((2, 1, 0))
    model_preds = np.stack(annotator_model_df.loc[annotator_model_df["structure"] == "passive_full"]["model_pred"].to_list()).T

    scores = []
    for i, property in enumerate(properties_list):
        y1 = per_annotator[i, 0] < 2
        y2 = per_annotator[i, 1] < 2
        y_pred = model_preds[i] < 2

        kappa_human = cohen_kappa_score(y1, y2, labels=[0, 1])
        f1_human = f1_score(y1, y2, labels=[0, 1], average='micro')

        kappa_model_1 = cohen_kappa_score(y1, y_pred, labels=[0, 1])
        kappa_model_2 = cohen_kappa_score(y2, y_pred, labels=[0, 1])
        f1_model_1 = f1_score(y1, y_pred, labels=[0, 1], average='micro')
        f1_model_2 = f1_score(y2, y_pred, labels=[0, 1], average='micro')

        assert f1_1 == f1_2
        scores.append({
            "property": property, "Kappa_humans": kappa_human, "F1_humans": f1_human,
            "Kappa_model_1": kappa_model_1, "Kappa_model_2": kappa_model_2, 
            "Kappa_model_avg": np.mean([kappa_model_1, kappa_model_2]),
            "F1_model_1": f1_model_1, "F1_model_2": f1_model_2,
            "F1_model_avg": np.mean([f1_model_1, f1_model_2]),
        })
    return scores

scores = evaluate_passive_only(annotator_model_df)
pd.DataFrame(scores)


Unnamed: 0,property,Kappa_humans,F1_humans,Kappa_model_1,Kappa_model_2,Kappa_model_avg,F1_model_1,F1_model_2,F1_model_avg
0,awareness,0.777708,0.888889,0.809584,0.778127,0.793855,0.904762,0.888889,0.896825
1,change_of_location,0.255382,0.777778,0.260807,0.260807,0.260807,0.77381,0.77381,0.77381
2,change_of_state,0.20935,0.68254,0.128856,0.221759,0.175307,0.638889,0.670635,0.654762
3,change_of_possession,0.364407,0.900794,0.306167,0.43128,0.368724,0.900794,0.920635,0.910714
4,existed_after,0.309073,0.869048,0.287037,0.260163,0.2736,0.869048,0.896825,0.882937
5,existed_before,0.454217,0.857143,0.544011,0.463634,0.503823,0.876984,0.861111,0.869048
6,existed_during,0.294776,0.964286,0.0,0.0,0.0,0.960317,0.988095,0.974206
7,instigation,0.313927,0.690476,0.314361,0.301829,0.308095,0.706349,0.722222,0.714286
8,sentient,0.768529,0.884921,0.814658,0.792395,0.803527,0.90873,0.896825,0.902778
9,volition,0.760091,0.880952,0.791815,0.743316,0.767565,0.896825,0.873016,0.884921


## GPT-2

In [20]:
gpt_pred = {}

with torch.no_grad():
    for t in test:
        i = t["index"]
        tokens = gpt_tokenizer(t["sentence"],
                        padding="max_length", truncation=True,
                        max_length=256, return_tensors="pt")
        
        logits = gpt_model(**tokens).logits
        logits = torch.Tensor(logits.reshape(logits.shape[0], 3, len(properties_list)))

        # get probabilities using softmax
        probs = torch.softmax(logits, axis=1)
        y_pred = np.argmax(probs, axis=1)
        gpt_pred[i] = y_pred
        if i % 10 == 0:
            print(i)


0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
590
600
610
620
630


In [21]:
annotator_model_df = per_annotator_df.iloc[list(gpt_pred.keys())]
annotator_model_df["model_pred"] = torch.stack(list(gpt_pred.values())).numpy().squeeze().tolist()
annotator_model_df.to_csv("gpt_pred.csv")


In [22]:
annotator_model_df = pd.read_csv("gpt_pred.csv")

annotator_model_df["labels"] = annotator_model_df["labels"].apply(
    lambda x: np.loadtxt(StringIO(x.replace("[", "").replace("]", "")))
)
annotator_model_df["model_pred"] = annotator_model_df["model_pred"].apply(eval)


In [23]:
# np.save("gpt_pred.npy", torch.stack(list(gpt_pred.values())).numpy())
# np.save("roberta_pred.npy", torch.stack(list(roberta_pred.values())).numpy())


### Active and Passive

In [24]:
scores = evaluate_active_passive(annotator_model_df)
pd.DataFrame(scores)


Unnamed: 0,property,Kappa_humans,F1_humans,Kappa_model_1,Kappa_model_2,Kappa_model_avg,F1_model_1,F1_model_2,F1_model_avg
0,awareness,0.785152,0.893987,0.83406,0.798482,0.816271,0.917722,0.900316,0.909019
1,change_of_location,0.284605,0.778481,0.246847,0.321939,0.284393,0.794304,0.81962,0.806962
2,change_of_state,0.159889,0.662975,0.173411,0.2387,0.206056,0.677215,0.700949,0.689082
3,change_of_possession,0.333503,0.908228,0.260237,0.307204,0.28372,0.912975,0.922468,0.917722
4,existed_after,0.278614,0.860759,0.234331,0.253711,0.244021,0.857595,0.889241,0.873418
5,existed_before,0.431633,0.852848,0.527687,0.420194,0.473941,0.867089,0.843354,0.855222
6,existed_during,0.277989,0.962025,0.126493,0.258849,0.192671,0.960443,0.982595,0.971519
7,instigation,0.320917,0.710443,0.315691,0.303708,0.309699,0.705696,0.716772,0.711234
8,sentient,0.781606,0.890823,0.851269,0.835502,0.843386,0.925633,0.917722,0.921677
9,volition,0.746863,0.873418,0.75011,0.724673,0.737391,0.875,0.862342,0.868671


### Active Only

In [25]:
scores = evaluate_active_only(annotator_model_df)
pd.DataFrame(scores)


Unnamed: 0,property,Kappa_humans,F1_humans,Kappa_model_1,Kappa_model_2,Kappa_model_avg,F1_model_1,F1_model_2,F1_model_avg
0,awareness,0.788189,0.897368,0.838073,0.799464,0.818768,0.921053,0.902632,0.911842
1,change_of_location,0.302569,0.778947,0.257348,0.345258,0.301303,0.802632,0.834211,0.818421
2,change_of_state,0.127477,0.65,0.182297,0.221707,0.202002,0.684211,0.702632,0.693421
3,change_of_possession,0.306569,0.913158,0.278572,0.305066,0.291819,0.921053,0.928947,0.925
4,existed_after,0.259075,0.855263,0.183135,0.298116,0.240625,0.85,0.894737,0.872368
5,existed_before,0.416361,0.85,0.530888,0.458975,0.494931,0.871053,0.857895,0.864474
6,existed_during,0.267729,0.960526,0.096314,0.170708,0.133511,0.957895,0.976316,0.967105
7,instigation,0.321521,0.723684,0.328102,0.317112,0.322607,0.723684,0.736842,0.730263
8,sentient,0.78721,0.894737,0.857071,0.846038,0.851554,0.928947,0.923684,0.926316
9,volition,0.735147,0.868421,0.745443,0.704091,0.724767,0.873684,0.852632,0.863158


### Passive Only

In [26]:
scores = evaluate_passive_only(annotator_model_df)
pd.DataFrame(scores)


Unnamed: 0,property,Kappa_humans,F1_humans,Kappa_model_1,Kappa_model_2,Kappa_model_avg,F1_model_1,F1_model_2,F1_model_avg
0,awareness,0.777708,0.888889,0.82543,0.793846,0.809638,0.912698,0.896825,0.904762
1,change_of_location,0.255382,0.777778,0.236448,0.291979,0.264213,0.781746,0.797619,0.789683
2,change_of_state,0.20935,0.68254,0.162062,0.261168,0.211615,0.666667,0.698413,0.68254
3,change_of_possession,0.364407,0.900794,0.236178,0.308901,0.272539,0.900794,0.912698,0.906746
4,existed_after,0.309073,0.869048,0.309073,0.185345,0.247209,0.869048,0.880952,0.875
5,existed_before,0.454217,0.857143,0.523037,0.365346,0.444191,0.861111,0.821429,0.84127
6,existed_during,0.294776,0.964286,0.175872,0.497006,0.336439,0.964286,0.992063,0.978175
7,instigation,0.313927,0.690476,0.292429,0.279114,0.285771,0.678571,0.686508,0.68254
8,sentient,0.768529,0.884921,0.838988,0.81642,0.827704,0.920635,0.90873,0.914683
9,volition,0.760091,0.880952,0.751305,0.750766,0.751035,0.876984,0.876984,0.876984


## GPT-2 Truncated

In [27]:
gpt_pred = {}

with torch.no_grad():
    for t in test_truncated:
        i = t["index"]
        tokens = gpt_tokenizer(t["sentence"],
                        padding="max_length", truncation=True,
                        max_length=256, return_tensors="pt")
        
        logits = gpt_model(**tokens).logits
        logits = torch.Tensor(logits.reshape(logits.shape[0], 3, len(properties_list)))

        # get probabilities using softmax
        probs = torch.softmax(logits, axis=1)
        y_pred = np.argmax(probs, axis=1)
        gpt_pred[i] = y_pred
        if i % 10 == 0:
            print(i)


0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
590
600
610
620
630


In [28]:
annotator_model_df = per_annotator_df.iloc[list(gpt_pred.keys())]
annotator_model_df["model_pred"] = torch.stack(list(gpt_pred.values())).numpy().squeeze().tolist()
annotator_model_df.to_csv("gpt_truncated_pred.csv")


In [29]:
annotator_model_df = pd.read_csv("gpt_truncated_pred.csv")

annotator_model_df["labels"] = annotator_model_df["labels"].apply(
    lambda x: np.loadtxt(StringIO(x.replace("[", "").replace("]", "")))
)
annotator_model_df["model_pred"] = annotator_model_df["model_pred"].apply(eval)


### Active Only

In [30]:
scores = evaluate_active_only(annotator_model_df)
pd.DataFrame(scores)


Unnamed: 0,property,Kappa_humans,F1_humans,Kappa_model_1,Kappa_model_2,Kappa_model_avg,F1_model_1,F1_model_2,F1_model_avg
0,awareness,0.788189,0.897368,0.848868,0.799464,0.824166,0.926316,0.902632,0.914474
1,change_of_location,0.302569,0.778947,0.125209,0.236899,0.181054,0.797368,0.834211,0.815789
2,change_of_state,0.127477,0.65,0.118244,0.157695,0.137969,0.686842,0.705263,0.696053
3,change_of_possession,0.306569,0.913158,0.247525,0.21844,0.232982,0.921053,0.923684,0.922368
4,existed_after,0.259075,0.855263,0.259075,0.321659,0.290367,0.855263,0.889474,0.872368
5,existed_before,0.416361,0.85,0.587567,0.39908,0.493323,0.876316,0.826316,0.851316
6,existed_during,0.267729,0.960526,0.087571,0.153675,0.120623,0.955263,0.973684,0.964474
7,instigation,0.321521,0.723684,0.300981,0.264097,0.282539,0.723684,0.731579,0.727632
8,sentient,0.78721,0.894737,0.758963,0.749037,0.754,0.878947,0.873684,0.876316
9,volition,0.735147,0.868421,0.722666,0.658934,0.6908,0.860526,0.828947,0.844737


### Passive Only

In [31]:
scores = evaluate_passive_only(annotator_model_df)
pd.DataFrame(scores)


Unnamed: 0,property,Kappa_humans,F1_humans,Kappa_model_1,Kappa_model_2,Kappa_model_avg,F1_model_1,F1_model_2,F1_model_avg
0,awareness,0.777708,0.888889,0.801587,0.769841,0.785714,0.900794,0.884921,0.892857
1,change_of_location,0.255382,0.777778,-0.056247,0.080043,0.011898,0.753968,0.785714,0.769841
2,change_of_state,0.20935,0.68254,0.125777,0.229116,0.177446,0.65873,0.690476,0.674603
3,change_of_possession,0.364407,0.900794,0.131256,0.20442,0.167838,0.892857,0.904762,0.89881
4,existed_after,0.309073,0.869048,0.18345,0.185345,0.184397,0.845238,0.880952,0.863095
5,existed_before,0.454217,0.857143,0.452797,0.355155,0.403976,0.833333,0.809524,0.821429
6,existed_during,0.294776,0.964286,0.155496,0.394231,0.274863,0.960317,0.988095,0.974206
7,instigation,0.313927,0.690476,0.228921,0.261638,0.245279,0.678571,0.718254,0.698413
8,sentient,0.768529,0.884921,0.797376,0.791921,0.794648,0.900794,0.896825,0.89881
9,volition,0.760091,0.880952,0.753191,0.721168,0.73718,0.876984,0.861111,0.869048


### BERT

In [32]:
bert_pred = {}

with torch.no_grad():
    for t in test:
        i = t["index"]
        tokens = bert_tokenizer(t["sentence"],
                        padding="max_length", truncation=True,
                        max_length=256, return_tensors="pt")
        
        logits = bert_model(**tokens).logits
        logits = torch.Tensor(logits.reshape(logits.shape[0], 3, len(properties_list)))

        # get probabilities using softmax
        probs = torch.softmax(logits, axis=1)
        y_pred = np.argmax(probs, axis=1)
        bert_pred[i] = y_pred
        if i % 10 == 0:
            print(i)


0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
590
600
610
620
630


In [33]:
annotator_model_df = per_annotator_df.iloc[list(bert_pred.keys())]
annotator_model_df["model_pred"] = torch.stack(list(bert_pred.values())).numpy().squeeze().tolist()
annotator_model_df.to_csv("bert_pred.csv")


In [34]:
annotator_model_df = pd.read_csv("bert_pred.csv")

annotator_model_df["labels"] = annotator_model_df["labels"].apply(
    lambda x: np.loadtxt(StringIO(x.replace("[", "").replace("]", "")))
)
annotator_model_df["model_pred"] = annotator_model_df["model_pred"].apply(eval)


In [35]:
scores = evaluate_active_passive(annotator_model_df)
pd.DataFrame(scores)


Unnamed: 0,property,Kappa_humans,F1_humans,Kappa_model_1,Kappa_model_2,Kappa_model_avg,F1_model_1,F1_model_2,F1_model_avg
0,awareness,0.785152,0.893987,0.78557,0.788094,0.786832,0.893987,0.89557,0.894778
1,change_of_location,0.284605,0.778481,0.189254,0.313363,0.251309,0.78481,0.822785,0.803797
2,change_of_state,0.159889,0.662975,0.218758,0.219582,0.21917,0.699367,0.697785,0.698576
3,change_of_possession,0.333503,0.908228,0.363647,0.360135,0.361891,0.920886,0.924051,0.922468
4,existed_after,0.278614,0.860759,0.186224,0.253791,0.220007,0.868671,0.90981,0.889241
5,existed_before,0.431633,0.852848,0.468302,0.39664,0.432471,0.862342,0.851266,0.856804
6,existed_during,0.277989,0.962025,0.133455,0.279197,0.206326,0.962025,0.984177,0.973101
7,instigation,0.320917,0.710443,0.367818,0.254717,0.311268,0.735759,0.708861,0.72231
8,sentient,0.781606,0.890823,0.813295,0.803868,0.808582,0.906646,0.901899,0.904272
9,volition,0.746863,0.873418,0.759149,0.708899,0.734024,0.879747,0.85443,0.867089


In [36]:
scores = evaluate_active_only(annotator_model_df)
pd.DataFrame(scores)


Unnamed: 0,property,Kappa_humans,F1_humans,Kappa_model_1,Kappa_model_2,Kappa_model_avg,F1_model_1,F1_model_2,F1_model_avg
0,awareness,0.788189,0.897368,0.7725,0.787752,0.780126,0.889474,0.897368,0.893421
1,change_of_location,0.302569,0.778947,0.21774,0.366043,0.291892,0.792105,0.839474,0.815789
2,change_of_state,0.127477,0.65,0.227442,0.196662,0.212052,0.710526,0.702632,0.706579
3,change_of_possession,0.306569,0.913158,0.316716,0.395866,0.356291,0.923684,0.936842,0.930263
4,existed_after,0.259075,0.855263,0.16849,0.2875,0.227995,0.868421,0.913158,0.890789
5,existed_before,0.416361,0.85,0.457318,0.417178,0.437248,0.860526,0.857895,0.859211
6,existed_during,0.267729,0.960526,0.116866,0.215802,0.166334,0.963158,0.981579,0.972368
7,instigation,0.321521,0.723684,0.380869,0.249668,0.315268,0.755263,0.726316,0.740789
8,sentient,0.78721,0.894737,0.793547,0.803565,0.798556,0.897368,0.902632,0.9
9,volition,0.735147,0.868421,0.749425,0.697975,0.7237,0.876316,0.85,0.863158


In [37]:
scores = evaluate_passive_only(annotator_model_df)
pd.DataFrame(scores)


Unnamed: 0,property,Kappa_humans,F1_humans,Kappa_model_1,Kappa_model_2,Kappa_model_avg,F1_model_1,F1_model_2,F1_model_avg
0,awareness,0.777708,0.888889,0.801562,0.785579,0.793571,0.900794,0.892857,0.896825
1,change_of_location,0.255382,0.777778,0.148649,0.238265,0.193457,0.77381,0.797619,0.785714
2,change_of_state,0.20935,0.68254,0.20935,0.247876,0.228613,0.68254,0.690476,0.686508
3,change_of_possession,0.364407,0.900794,0.417181,0.317536,0.367358,0.916667,0.904762,0.910714
4,existed_after,0.309073,0.869048,0.211604,0.202532,0.207068,0.869048,0.904762,0.886905
5,existed_before,0.454217,0.857143,0.484539,0.366436,0.425487,0.865079,0.84127,0.853175
6,existed_during,0.294776,0.964286,0.155496,0.394231,0.274863,0.960317,0.988095,0.974206
7,instigation,0.313927,0.690476,0.343056,0.25217,0.297613,0.706349,0.68254,0.694444
8,sentient,0.768529,0.884921,0.838988,0.800456,0.819722,0.920635,0.900794,0.910714
9,volition,0.760091,0.880952,0.768529,0.72032,0.744424,0.884921,0.861111,0.873016
