In [None]:
import itertools
import pickle
from collections import defaultdict
from os.path import expanduser

import matplotlib as mpl
mpl.use('agg')  # the backend has to be chosen before pyplot is imported
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import linalg
from scipy.stats import spearmanr, pearsonr
from sklearn import mixture


# Concreteness correlations

In [None]:
# Correlate the mean per-lemma 'Is.Abstract.Norm' annotation with the
# 'Conc.M' rating of the same word in the concreteness file.
home = expanduser('~')
datafile = home + '/Research/protocols/data/noun_raw_data_norm_122218.tsv'

data = pd.read_csv(datafile, sep="\t")

# Keep only the train and dev rows; every other split is left out.
data = data[data['Split'].isin(['train', 'dev'])]

path = home + "/Research/protocols/data/concreteness.tsv"
concreteness = pd.read_csv(path, sep="\t")

# Word -> Conc.M lookup, built once instead of re-filtering the ratings frame
# for every lemma. When a word is listed more than once the first row wins.
conc_by_word = concreteness.drop_duplicates(subset='Word').set_index('Word')['Conc.M']

abs_conc = data.groupby('Lemma')['Is.Abstract.Norm'].mean().to_frame().reset_index()
lemma_keys = abs_conc['Lemma'].str.lower()
has_rating = lemma_keys.isin(conc_by_word.index)
abs_conc['Concreteness'] = lemma_keys.map(conc_by_word)

# Coverage: fraction of annotated lemmas that have a concreteness rating.
n_lemmas_total = len(abs_conc)
abs_conc = abs_conc[has_rating]
print(len(abs_conc) / n_lemmas_total)

abstractness_scores = abs_conc['Is.Abstract.Norm'].values
concreteness_scores = abs_conc['Concreteness'].values
print("Spearman correlation: ", np.round(spearmanr(abstractness_scores, concreteness_scores)[0], 2))
print("Pearson correlation: ", np.round(pearsonr(abstractness_scores, concreteness_scores)[0], 2))

# LCS

In [None]:
# Compare the 'Is.Dynamic.Norm' annotations of each lemma with the values
# that `lcs.eventive` returns for the same verb.
path = home + "/Research/protocols/data/verbs-English.lcs"
# Recipe for (re)building the pickle that is loaded below; kept for reference:
# lcs = LexicalConceptualStructureLexicon(path)
# with open(home + '/Downloads/lcs.pkl', 'wb') as f:
#     pickle.dump(lcs, f)
# print("Pickled")
# SECURITY: pickle.load can execute arbitrary code contained in the file, so
# only load a pickle that you produced yourself.
with open(home + '/Downloads/lcs.pkl', 'rb') as f:
    lcs = pickle.load(f)

# Read annotations
datafile = home + "/Research/protocols/data/pred_raw_data_norm_122218.tsv"

data = pd.read_csv(datafile, sep="\t")

# Keep only the train and dev rows; every other split is left out.
data = data[data['Split'].isin(['train', 'dev'])]

# One row per lemma, holding the list of all its 'Is.Dynamic.Norm' values.
dyn_lcs = data.groupby('Lemma')['Is.Dynamic.Norm'].apply(list).to_frame().reset_index()
# -1 is the sentinel for lemmas that are not in lcs.verbs.
dyn_lcs['lcs'] = dyn_lcs['Lemma'].map(lambda x: lcs.eventive(x.lower()) if x.lower() in lcs.verbs else -1)
num_of_lemmas = len(dyn_lcs)
# set_index returns a new frame, so the column assignments below do not
# write into a filtered view of the original.
dyn_lcs = dyn_lcs[dyn_lcs['lcs'] != -1].set_index('Lemma')
# An annotation counts as dynamic when its normalized score is positive.
dyn_lcs['dyn'] = dyn_lcs['Is.Dynamic.Norm'].apply(lambda x: [a > 0 for a in x])
# 1 when the annotated values and the lexicon values have at least one value
# in common, else 0.
# NOTE(review): assumes lcs.eventive returns an iterable of booleans - confirm.
dyn_lcs['comp'] = dyn_lcs.apply(lambda x: 1 if set(x['dyn']).intersection(set(x['lcs'])) else 0, axis=1)

print("Percentage of lemmas found in lcs database:", len(dyn_lcs) / num_of_lemmas)
print("They share at least one sense:", sum(dyn_lcs['comp']) / len(dyn_lcs))

# Factuality

In [None]:
# Correlate the predicate annotations with the 'Happened.Norm' factuality
# score of the same predicate token.
datafile = home + "/Research/protocols/data/pred_raw_data_norm_122218.tsv"
pred_data = pd.read_csv(datafile, sep="\t")

datafile_ = home + "/Research/protocols/data/it-happened_eng_ud1.2_07092017_normalized.tsv"
fact_data = pd.read_csv(datafile_, sep="\t")

# .copy() so the column assignments below write to an independent frame
# rather than to a filtered view (avoids SettingWithCopyWarning).
pred_data = pred_data[pred_data['Split'].isin(['train', 'dev'])].copy()
pred_data['Sentence.ID'] = pred_data['Sentence.ID'].str.replace('sent_', '', regex=False)
pred_data['Unique.ID'] = pred_data.apply(lambda x: str(x['Sentence.ID']) + "_" + str(x["Root.Token"]), axis=1)
# numeric_only=True: the frame still carries string columns (e.g. 'Split'),
# and pandas >= 2.0 raises on them instead of silently dropping them.
pred_data = pred_data.groupby('Unique.ID', as_index=False).mean(numeric_only=True)

fact_data = fact_data[fact_data['Split'].isin(['train', 'dev'])].copy()
# NOTE(review): the "- 1" suggests Pred.Token is 1-indexed while Root.Token
# is 0-indexed - confirm against the data documentation.
fact_data['Unique.ID'] = fact_data.apply(lambda x: str(x['Sentence.ID']) + "_" + str(x["Pred.Token"] - 1), axis=1)
fact_data = fact_data.groupby('Unique.ID', as_index=False).mean(numeric_only=True)

hyp_fact = pred_data.loc[:, ['Unique.ID', 'Is.Hypothetical.Norm', 'Is.Particular.Norm', 'Is.Dynamic.Norm']].copy()
# Unique.ID is unique after the groupby above, so it can serve as a lookup
# index; IDs without a factuality score map to NaN and are dropped next.
happened_by_id = fact_data.set_index('Unique.ID')['Happened.Norm']
hyp_fact['Happened.Norm'] = hyp_fact['Unique.ID'].map(happened_by_id)
hyp_fact2 = hyp_fact.dropna()
# Fraction of predicates that could be matched (and have no missing score).
print(np.round(len(hyp_fact2) / len(hyp_fact), 2))

happened_scores = hyp_fact2['Happened.Norm'].values
for attr in ['Is.Hypothetical.Norm', 'Is.Particular.Norm', 'Is.Dynamic.Norm']:
    print(attr)
    print("Spearman correlation: ", np.round(spearmanr(hyp_fact2[attr].values, happened_scores)[0], 2))
    print("Pearson correlation: ", np.round(pearsonr(hyp_fact2[attr].values, happened_scores)[0], 2))

# Plot Bars

In [None]:
# Grouped bar charts of the scores, one group per feature set.
FEATURE_SETS = ['Token', 'Type', 'GloVe', 'ELMO', 'Hand', 'All']
BAR_WIDTH = 0.2


def _save_grouped_bars(title, ylabel, series, filename, xlabel=None):
    """Draw one grouped bar chart and write it to `filename`.

    Parameters
    ----------
    title : str
        Figure title.
    ylabel : str
        Label of the y axis.
    series : list of (label, color, heights)
        One entry per bar colour; `heights` holds one value per entry of
        FEATURE_SETS. The bars of a group are laid out side by side,
        centred on the group's tick.
    filename : str
        Path of the image file to write.
    xlabel : str, optional
        Label of the x axis; no label is set when None.
    """
    positions = np.arange(len(FEATURE_SETS))
    fig, ax = plt.subplots()
    fig.suptitle(title)
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    middle = (len(series) - 1) / 2
    for rank, (label, color, heights) in enumerate(series):
        offset = (rank - middle) * BAR_WIDTH
        ax.bar(positions + offset, heights, width=BAR_WIDTH, color=color, align='center', label=label)

    # Explicit ticks and labels, so that every series can use numeric
    # positions instead of mixing category strings with numeric offsets.
    ax.set_xticks(positions)
    ax.set_xticklabels(FEATURE_SETS)

    # Shrink the axes to 80% of their width to make room for the legend.
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

    # Put a legend to the right of the current axis
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    fig.savefig(filename)
    plt.close(fig)  # release the figure once it is on disk


_save_grouped_bars(
    'Argument correlation scores', 'Pearson correlation',
    [('Particular', 'b', [39.2, 51.6, 44.8, 58.2, 54.8, 58.0]),
     ('Kind', 'g', [26.0, 42.0, 33.1, 47.9, 45.6, 48.0]),
     ('Abstract', 'r', [49.2, 34.4, 46.9, 55.8, 51.6, 56.2])],
    'bars-arg-pear.png')

_save_grouped_bars(
    'Predicate correlation scores', 'Pearson correlation',
    [('Particular', 'b', [16.3, 20.9, 20.3, 27.5, 24.7, 28.5]),
     ('Hypothetical', 'g', [13.8, 38.3, 22.9, 42.2, 38.8, 42.0]),
     ('Dynamic', 'r', [33.2, 31.5, 29.4, 38.3, 37.5, 38.8])],
    'bars-pred-pear.png')

# Only this figure carries an x-axis label.
_save_grouped_bars(
    'Argument R1 scores', 'R1',
    [('Particular', 'b', [7.9, 13.1, 9.7, 15.7, 13.5, 15.8]),
     ('Kind', 'g', [2.1, 8.9, 2.6, 11.6, 10.8, 11.4]),
     ('Abstract', 'r', [14.1, 6.3, 11.9, 17.3, 15.2, 17.7])],
    'bars-arg-r1.png', xlabel='Model features')

_save_grouped_bars(
    'Predicate R1 scores', 'R1',
    [('Particular', 'b', [1.2, 2.7, 2.2, 3.9, 3.4, 4.5]),
     ('Hypothetical', 'g', [0, 3.7, 0, 6.0, 2.9, 7.2]),
     ('Dynamic', 'r', [6.0, 5.4, 4.6, 7.8, 7.9, 8.4])],
    'bars-pred-r1.png')

# SPR

In [None]:
# Correlate the argument / predicate annotations with the per-property SPR
# responses and print one LaTeX table row per property.
home = expanduser('~')
attr_map = {"part": "Is.Particular", "kind": "Is.Kind", "abs": "Is.Abstract"}
attr_conf = {"part": "Part.Confidence", "kind": "Kind.Confidence",
         "abs": "Abs.Confidence"}
attrs = ["part", "kind", "abs"]

# Row order of the printed tables. An earlier, commented-out version of this
# list additionally contained 'change_of_state_continuous'.
properties = ['volition', 'awareness', 'sentient', 'change_of_location', 'instigation', 'change_of_state', 'was_used', 'change_of_possession', 'partitive', 'was_for_benefit', 'existed_before', 'existed_during', 'existed_after']


def _span_unique_id(sentence_id, span):
    """Build '<sentence>_<first>_<last>' from a comma-separated token span."""
    tokens = str(span).split(',')
    return str(sentence_id) + "_" + tokens[0] + "_" + tokens[-1]


def _print_spr_correlations(spr_frame, annotations, attribute_names):
    """Print, per SPR property, the Pearson correlation with three attributes.

    Parameters
    ----------
    spr_frame : DataFrame
        SPR responses with 'Property', 'Annotator.ID', 'Response' and
        'Unique.ID' columns.
    annotations : DataFrame
        Mean annotations indexed by Unique.ID.
    attribute_names : list of str
        The three columns of `annotations` to correlate against.
    """
    for prop in properties:
        # .copy() so the new column is written to an independent frame.
        prop_df = spr_frame[spr_frame['Property'] == prop].copy()
        # NOTE(review): `ridit` is not defined in any visible cell - confirm
        # it is defined or imported before this cell runs.
        prop_df['Response.ridit'] = prop_df.groupby('Annotator.ID')['Response'].transform(ridit)
        # numeric_only=True: pandas >= 2.0 raises on the remaining string
        # columns instead of silently dropping them.
        prop_df = prop_df.groupby('Unique.ID', as_index=False).mean(numeric_only=True)
        prop_df = prop_df.loc[:, ['Unique.ID', 'Response.ridit']].dropna()

        # IDs without an annotation map to NaN and are dropped below.
        for attr in attribute_names:
            prop_df[attr] = prop_df['Unique.ID'].map(annotations[attr])
        prop_df = prop_df.dropna()

        cells = [np.round(pearsonr(prop_df[attr].values, prop_df['Response.ridit'])[0], 2) for attr in attribute_names]
        print(prop.replace('_', ' '), '&', cells[0], '&', cells[1], '&', cells[2], "\\\\")


# --- Arguments ---
attributes = ['Is.Particular.Norm', 'Is.Kind.Norm', 'Is.Abstract.Norm']

# NOTE(review): this cell reads from ~/Desktop/protocols while the earlier
# cells read from ~/Research/protocols - confirm which location is intended.
datafile = home + "/Desktop/protocols/data/noun_raw_data_norm_122218.tsv"
arg = pd.read_csv(datafile, sep="\t")

arg['Sentence.ID'] = arg['Sentence.ID'].str.replace('sent_', '', regex=False)
arg['Unique.ID'] = [_span_unique_id(sent, span) for sent, span in zip(arg['Sentence.ID'], arg['Span'])]
arg = arg.dropna()
arg = arg[arg['Split'].isin(['train', 'dev'])]
arg = arg.groupby('Unique.ID', as_index=True).mean(numeric_only=True)

datafile_ = home + "/Desktop/protocols/data/spr/protoroles_eng_ud1.2_11082016.tsv"
spr = pd.read_csv(datafile_, sep="\t")
# pred token is 0 indexed in SPR
spr['Unique.ID'] = spr.apply(lambda x: str(x['Sentence.ID']) + "_" + str(x["Arg.Tokens.Begin"]) + "_" + str(x["Arg.Tokens.End"]), axis=1)
spr = spr[~spr['Is.Pilot']]
spr = spr.dropna()
spr = spr[spr['Split'].isin(['train', 'dev'])]

print("Arg\n")
_print_spr_correlations(spr, arg, attributes)

# --- Predicates ---
attributes = ['Is.Particular.Norm', 'Is.Hypothetical.Norm', 'Is.Dynamic.Norm']
datafile = home + "/Desktop/protocols/data/pred_raw_data_norm_122218.tsv"
pred = pd.read_csv(datafile, sep="\t")
pred['Sentence.ID'] = pred['Sentence.ID'].str.replace('sent_', '', regex=False)

# 'Context.Span' holds several ';'-separated spans; give each its own row.
lst_col = 'Context.Span'
pred = pred.assign(**{lst_col: pred[lst_col].str.split(';')}).explode(lst_col).reset_index(drop=True)
pred['Unique.ID'] = [_span_unique_id(sent, span) for sent, span in zip(pred['Sentence.ID'], pred[lst_col])]
pred = pred.dropna()
pred = pred[pred['Split'].isin(['train', 'dev'])]
pred = pred.groupby('Unique.ID', as_index=True).mean(numeric_only=True)

print("\nPred\n")
_print_spr_correlations(spr, pred, attributes)

# Analysis in paper

In [None]:
# Break the dev-set predictions down by POS tag and dependency relation and
# print the scores as LaTeX table rows.
# NOTE(review): `sns` and `r1_score` are not defined in any visible cell -
# confirm they are imported/defined before this cell runs.
sigdig = 1
data_arg = pd.read_csv('dev_preds_arg.tsv', sep='\t')
data_pred = pd.read_csv('dev_preds_pred.tsv', sep='\t')

# (annotation column, prediction column) pairs, in table-column order.
ARG_PAIRS = [('Is.Particular.Norm', 'part.Pred'), ('Is.Kind.Norm', 'kind.Pred'), ('Is.Abstract.Norm', 'abs.Pred')]
PRED_PAIRS = [('Is.Particular.Norm', 'part.Pred'), ('Is.Hypothetical.Norm', 'hyp.Pred'), ('Is.Dynamic.Norm', 'dyn.Pred')]


def _score_cells(frame, pairs):
    """Return [pearson, r1, pearson, r1, ...] (x100, rounded), one pair of values per column pair."""
    cells = []
    for gold_col, pred_col in pairs:
        cells.append(np.round(pearsonr(frame[gold_col], frame[pred_col])[0] * 100, sigdig))
        cells.append(np.round(r1_score(frame[gold_col], frame[pred_col]) * 100, sigdig))
    return cells


def _joint_r1_cell(frame, pairs):
    """Return r1_score computed over all column pairs at once (x100, rounded)."""
    gold_cols = [gold_col for gold_col, _ in pairs]
    pred_cols = [pred_col for _, pred_col in pairs]
    return np.round(r1_score(frame.loc[:, gold_cols].values, frame.loc[:, pred_cols].values) * 100, sigdig)


def _print_latex_row(label, cells):
    """Print 'label & cell & ... & cell \\\\'."""
    parts = [label]
    for cell in cells:
        parts += ['&', cell]
    print(*parts, "\\\\")


def _labels(frame, column):
    """Distinct non-missing values of `column`, sorted so the row order is reproducible."""
    return sorted(frame[column].dropna().unique())


def _print_counts(frame, column):
    """Print a list of (label, number of rows) tuples for `column`."""
    print([(label, len(frame[frame[column] == label])) for label in _labels(frame, column)])


def _print_group_table(frame, column, pairs):
    """Print one row per value of `column`: per-attribute scores plus the joint R1."""
    for label in _labels(frame, column):
        group = frame[frame[column] == label]
        _print_latex_row(label, _score_cells(group, pairs) + [_joint_r1_cell(group, pairs)])


def _print_shared_table(column):
    """Print kind/abstract (argument) and hypothetical (predicate) scores side by side.

    Only labels that occur in BOTH frames are used: a label missing from
    data_pred would give an empty group, on which pearsonr fails.
    """
    shared = sorted(set(data_arg[column].dropna()) & set(data_pred[column].dropna()))
    for label in shared:
        arg_group = data_arg[data_arg[column] == label]
        pred_group = data_pred[data_pred[column] == label]
        _print_latex_row(label, _score_cells(arg_group, ARG_PAIRS[1:]) + _score_cells(pred_group, PRED_PAIRS[1:2]))


# Do analysis on argument over here
thing_words = data_arg[data_arg['Lemma'].str.contains('thing')]

# Density of annotated vs. predicted scores for lemmas containing 'thing'.
plt.figure()
for column, label in [('Is.Particular.Norm', 'Part Ann'), ('part.Pred', 'Part Pred'),
                      ('Is.Kind.Norm', 'Kind Ann'), ('kind.Pred', 'Kind Pred'),
                      ('Is.Abstract.Norm', 'Abs Ann'), ('abs.Pred', 'Abs Pred')]:
    sns.distplot(thing_words[column], hist=False, label=label)
plt.xlabel('Normalized score')
plt.savefig('things.png', transparent=True)


# R1 and correlation based on POS and gov_rel
_print_counts(data_arg, 'POS')
print("\nArg POS")
_print_group_table(data_arg, 'POS', ARG_PAIRS)

_print_counts(data_arg, 'DEPREL')
print("\nArg DEPREL")
_print_group_table(data_arg, 'DEPREL', ARG_PAIRS)

print("\npred POS")
_print_group_table(data_pred, 'POS', PRED_PAIRS)

print("\npred DEPREL")
_print_group_table(data_pred, 'DEPREL', PRED_PAIRS)

# Argument and predicate scores for the labels the two frames have in common.
_print_shared_table('POS')
_print_shared_table('DEPREL')


# Sentences where 'you' / 'they' received a positive kind annotation.
pron_df = data_arg[data_arg['Lemma'].isin(['you', 'they'])]
print(pron_df[pron_df['Is.Kind.Norm'] > 0]['Sentences'])

# Sentences containing 'if ' whose hyp.Pred value is below -0.3.
hyp_df = data_pred[(data_pred['Sentences'].str.contains('if'))]
print(data_pred[(data_pred['Sentences'].str.contains('if ', regex=False)) & (data_pred['hyp.Pred'] < -0.3)]['Sentences'])