In [6]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr, pearsonr
from os.path import expanduser
from sklearn import mixture
import itertools
from scipy import linalg
import matplotlib as mpl
mpl.use('agg')
import matplotlib.pyplot as plt
from collections import defaultdict
import seaborn as sns
from pprint import pprint
from factslab.utility.lcsreader import LexicalConceptualStructureLexicon
from factslab.utility import ridit, r1_score, dev_mode_group

home = expanduser('~')
%matplotlib inline
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.max_colwidth', -1)

# Load the data

In [7]:
# Predicate annotations: read the TSV and keep only rows from the train and dev splits.
pred_datafile = home + "/Research/protocols/data/pred_raw_data_norm_122218.tsv"
pred_data = pd.read_csv(pred_datafile, sep="\t").loc[
    lambda frame: frame['Split'].isin(['train', 'dev'])
]
print("Predicate train + dev data shape", pred_data.shape)
# Columns holding the three predicate properties analysed below.
attributes_pred = ['Is.Particular.Norm', 'Is.Hypothetical.Norm', 'Is.Dynamic.Norm']


# Argument annotations: same treatment as the predicate file.
arg_datafile = home + "/Research/protocols/data/noun_raw_data_norm_122218.tsv"
arg_data = pd.read_csv(arg_datafile, sep="\t").loc[
    lambda frame: frame['Split'].isin(['train', 'dev'])
]
# Columns holding the three argument properties analysed below.
attributes_arg = ['Is.Particular.Norm', 'Is.Kind.Norm', 'Is.Abstract.Norm']
print("Argument train + dev data shape", arg_data.shape)

Predicate train + dev data shape (36549, 29)
Argument train + dev data shape (40883, 27)


# Concreteness correlations

In [9]:
# Collapse the per-annotator rows of every argument into one row via dev_mode_group.
# NOTE(review): `attr_map` and `attr_conf` are not defined in any cell visible in this
# notebook, so this line raises NameError on a fresh kernel — define them (or import
# them) before running this cell.
conc_data = arg_data.groupby('Unique.ID', as_index=False).apply(lambda x: dev_mode_group(x, attributes_arg, attr_map, attr_conf, type="regression")).reset_index(drop=True)

path = home + "/Research/protocols/data/concreteness.tsv"
conc = pd.read_csv(path, sep="\t")

# Word -> 'Conc.M' lookup built once, keeping the first row for a repeated word.
# This replaces a scan of the whole list and frame for every lemma.
# ('Conc.M' is presumably the mean concreteness rating — confirm against the data file.)
conc_by_word = {}
for word, score in zip(conc['Word'], conc['Conc.M']):
    conc_by_word.setdefault(word, score)

# Mean abstractness per lemma, paired with the concreteness score of the lower-cased
# lemma; -1 marks lemmas that are missing from the concreteness file.
abs_conc = conc_data.groupby('Lemma')['Is.Abstract.Norm'].mean().to_frame().reset_index()
abs_conc['conc'] = abs_conc['Lemma'].map(lambda lemma: conc_by_word.get(lemma.lower(), -1))

ini = len(abs_conc)
abs_conc = abs_conc[abs_conc['conc'] != -1]
print("Percentage of lemmas found in database:", len(abs_conc) / ini)
print("Spearman correlation: ", np.round(spearmanr(abs_conc['Is.Abstract.Norm'].values, abs_conc['conc'].values)[0], 2))
print("Pearson correlation: ", np.round(pearsonr(abs_conc['Is.Abstract.Norm'].values, abs_conc['conc'].values)[0], 2))

NameError: name 'attributes' is not defined

# LCS

In [None]:
path = home + "/Research/protocols/data/verbs-English.lcs"
lcs = LexicalConceptualStructureLexicon(path)

lcs_data = pred_data.copy()


def _eventive_or_missing(lemma):
    """Return lcs.eventive() for the lower-cased lemma, or -1 if the lexicon lacks it."""
    if lemma.lower() in lcs.verbs:
        return lcs.eventive(lemma.lower())
    return -1


# One row per lemma, carrying the list of all its Is.Dynamic.Norm annotations.
dyn_lcs = lcs_data.groupby('Lemma')['Is.Dynamic.Norm'].apply(list).to_frame().reset_index()
dyn_lcs['lcs'] = dyn_lcs['Lemma'].map(_eventive_or_missing)
num_of_lemmas = len(dyn_lcs)

# Keep only the lemmas the lexicon knows, indexed by lemma.
dyn_lcs = dyn_lcs[dyn_lcs['lcs'] != -1].set_index('Lemma')

# Binarise the annotations (score > 0), then flag lemmas for which at least one
# binarised annotation also occurs among the values returned by the lexicon.
dyn_lcs['dyn'] = dyn_lcs['Is.Dynamic.Norm'].apply(lambda scores: [score > 0 for score in scores])
dyn_lcs['comp'] = dyn_lcs.apply(
    lambda row: 1 if set(row['dyn']) & set(row['lcs']) else 0,
    axis=1,
)

print("Percentage of lemmas found in lcs database:", len(dyn_lcs) / num_of_lemmas)
print("They share at least one sense:", sum(dyn_lcs['comp']) / len(dyn_lcs))

# Factuality

In [None]:
datafile_ = home + "/Research/protocols/data/it-happened_eng_ud1.2_07092017_normalized.tsv"
fact_data = pd.read_csv(datafile_, sep="\t")

# Predicate side: key every annotation by "<sentence id>_<root token>" and average
# the numeric columns over annotators.
pred_data_f = pred_data.copy()
pred_data_f['Sentence.ID'] = pred_data_f['Sentence.ID'].str.replace('sent_', '', regex=False)
pred_data_f['Unique.ID'] = pred_data_f.apply(lambda x: str(x['Sentence.ID']) + "_" + str(x["Root.Token"]), axis=1)
# numeric_only=True states explicitly that text columns are dropped; newer pandas
# raises instead of dropping them silently.
pred_data_f = pred_data_f.groupby('Unique.ID', as_index=False).mean(numeric_only=True)

# Factuality side: same key, with the predicate token shifted by one
# (presumably 1-based here vs 0-based Root.Token — confirm against the data).
fact_data = fact_data[fact_data['Split'].isin(['train', 'dev'])].copy()
fact_data['Unique.ID'] = fact_data.apply(lambda x: str(x['Sentence.ID']) + "_" + str(x["Pred.Token"] - 1), axis=1)
fact_data = fact_data.groupby('Unique.ID', as_index=False).mean(numeric_only=True)

# Attach the factuality score to every predicate. Mapping through a Series indexed by
# Unique.ID replaces a full scan of fact_data for every row; ids without a match
# become NaN and are removed by dropna().
hyp_fact = pred_data_f.loc[:, ['Unique.ID', 'Is.Hypothetical.Norm', 'Is.Particular.Norm', 'Is.Dynamic.Norm']].copy()
happened_by_id = fact_data.set_index('Unique.ID')['Happened.Norm']
hyp_fact['Happened.Norm'] = hyp_fact['Unique.ID'].map(happened_by_id)
hyp_fact2 = hyp_fact.dropna()
print("Overlap percentage", np.round(len(hyp_fact2) / len(hyp_fact), 2))
for attr in ['Is.Hypothetical.Norm', 'Is.Particular.Norm', 'Is.Dynamic.Norm']:
    print(attr)
    print("Spearman correlation: ", np.round(spearmanr(hyp_fact2[attr].values, hyp_fact2['Happened.Norm'].values)[0], 2))
    print("Pearson correlation: ", np.round(pearsonr(hyp_fact2[attr].values, hyp_fact2['Happened.Norm'].values)[0], 2))

# Plot Bars

In [None]:
def plot_grouped_bars(title, ylabel, labels, series, xlabel=None, width=0.2):
    """Draw one grouped bar chart with the legend placed to the right of the axes.

    Parameters
    ----------
    title : str
        Figure title.
    ylabel : str
        Label of the y axis.
    labels : list of str
        One tick label per bar group.
    series : list of (legend_label, color, values)
        One entry per bar within a group; `values` has one number per group.
    xlabel : str, optional
        Label of the x axis; omitted when None.
    width : float
        Width of a single bar; bars of a group are centred on the group's tick.
    """
    positions = np.arange(len(labels))

    plt.figure()
    plt.suptitle(title)
    ax = plt.subplot(111)
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    # All bars use numeric positions; tick labels are set explicitly afterwards.
    # Mixing string positions with numeric offsets on one axis relies on deprecated
    # pass-through behaviour of matplotlib's category converter.
    for slot, (legend_label, color, values) in enumerate(series):
        offset = (slot - (len(series) - 1) / 2) * width
        ax.bar(positions + offset, values, width=width, color=color, align='center', label=legend_label)
    ax.set_xticks(positions)
    ax.set_xticklabels(labels)

    # Shrink the axes to 80% width so the legend fits to the right of them.
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.show()


# Feature sets compared in every chart; all scores below are hard-coded.
x = ['Token', 'Type', 'GloVe', 'ELMO', 'Hand', 'All']
ind = np.arange(len(x))

plot_grouped_bars(
    'Argument correlation scores', 'Pearson correlation', x,
    [('Particular', 'b', [39.2, 51.6, 44.8, 58.2, 54.8, 58.0]),
     ('Kind', 'g', [26.0, 42.0, 33.1, 47.9, 45.6, 48.0]),
     ('Abstract', 'r', [49.2, 34.4, 46.9, 55.8, 51.6, 56.2])],
)
# plt.savefig('bars-arg-pear.png')

plot_grouped_bars(
    'Predicate correlation scores', 'Pearson correlation', x,
    [('Particular', 'b', [16.3, 20.9, 20.3, 27.5, 24.7, 28.5]),
     ('Hypothetical', 'g', [13.8, 38.3, 22.9, 42.2, 38.8, 42.0]),
     ('Dynamic', 'r', [33.2, 31.5, 29.4, 38.3, 37.5, 38.8])],
)
# plt.savefig('bars-pred-pear.png')

plot_grouped_bars(
    'Argument R1 scores', 'R1', x,
    [('Particular', 'b', [7.9, 13.1, 9.7, 15.7, 13.5, 15.8]),
     ('Kind', 'g', [2.1, 8.9, 2.6, 11.6, 10.8, 11.4]),
     ('Abstract', 'r', [14.1, 6.3, 11.9, 17.3, 15.2, 17.7])],
    xlabel='Model features',
)
# plt.savefig('bars-arg-r1.png')

plot_grouped_bars(
    'Predicate R1 scores', 'R1', x,
    [('Particular', 'b', [1.2, 2.7, 2.2, 3.9, 3.4, 4.5]),
     ('Hypothetical', 'g', [0, 3.7, 0, 6.0, 2.9, 7.2]),
     ('Dynamic', 'r', [6.0, 5.4, 4.6, 7.8, 7.9, 8.4])],
)
# plt.savefig('bars-pred-r1.png')

# SPR

In [None]:
def _span_unique_id(sentence_id, span):
    """Build "<sentence id>_<first span token>_<last span token>" from a ','-separated span."""
    tokens = str(span).split(',')
    return str(sentence_id) + "_" + tokens[0] + "_" + tokens[-1]


def print_spr_correlations(spr_df, properties, attr_means, attributes):
    """Print one LaTeX table row per SPR property.

    For every property the responses are ridit-scored within each annotator and
    averaged per Unique.ID; each row then lists the Spearman correlation between
    that score and every column of `attributes`, restricted to the ids that are
    also present in `attr_means`.

    Parameters
    ----------
    spr_df : DataFrame with 'Property', 'Annotator.ID', 'Response' and 'Unique.ID'.
    properties : iterable of property names to report.
    attr_means : DataFrame indexed by Unique.ID holding the attribute columns.
    attributes : list of column names of `attr_means` to correlate against.
    """
    for prop in properties:
        # .copy() so the new column is written to an independent frame, not a slice.
        prop_df = spr_df[spr_df['Property'] == prop].copy()
        prop_df['Response.ridit'] = prop_df.groupby('Annotator.ID')['Response'].transform(ridit)
        prop_df = prop_df.groupby('Unique.ID', as_index=False).mean(numeric_only=True)
        prop_df = prop_df.loc[:, ['Unique.ID', 'Response.ridit']].dropna()

        # Index-based lookup; ids that are absent from attr_means become NaN and are dropped.
        for attr in attributes:
            prop_df[attr] = prop_df['Unique.ID'].map(attr_means[attr])
        prop_df = prop_df.dropna()

        cells = [prop.replace('_', ' ')]
        for attr in attributes:
            cells += ['&', np.round(spearmanr(prop_df[attr].values, prop_df['Response.ridit'].values)[0], 2)]
        print(*cells, "\\\\")


# Argument annotations averaged per "<sentence>_<span begin>_<span end>".
arg_data_spr = arg_data.copy()
arg_data_spr['Sentence.ID'] = arg_data_spr['Sentence.ID'].str.replace('sent_', '', regex=False)
arg_data_spr['Unique.ID'] = arg_data_spr.apply(lambda row: _span_unique_id(row['Sentence.ID'], row['Span']), axis=1)
arg_data_spr = arg_data_spr.dropna()
# numeric_only=True states explicitly that text columns are dropped; newer pandas
# raises instead of dropping them silently.
arg_data_spr = arg_data_spr.groupby('Unique.ID', as_index=True).mean(numeric_only=True)

datafile_ = home + "/Research/protocols/data/spr/protoroles_eng_ud1.2_11082016.tsv"
spr = pd.read_csv(datafile_, sep="\t")
# pred_data token is 0 indexed in SPR
spr['Unique.ID'] = spr.apply(lambda row: str(row['Sentence.ID']) + "_" + str(row["Arg.Tokens.Begin"]) + "_" + str(row["Arg.Tokens.End"]), axis=1)
spr = spr[~spr['Is.Pilot']]
spr = spr.dropna()

spr = spr[spr['Split'].isin(['train', 'dev'])]

properties = ['awareness', 'volition', 'sentient', 'instigation', 'existed_before', 'existed_during', 'existed_after', 'was_for_benefit', 'change_of_location', 'change_of_state', 'was_used', 'change_of_possession', 'partitive']

print("Arg\n")
print_spr_correlations(spr, properties, arg_data_spr, attributes_arg)


# Predicate annotations: 'Context.Span' is split on ';' and every piece gets its own
# row (all other columns repeated), so each piece can be keyed like an SPR argument.
# (Presumably one piece per argument of the predicate — confirm against the data.)
pred_data_spr = pred_data.copy()
pred_data_spr['Sentence.ID'] = pred_data['Sentence.ID'].str.replace('sent_', '', regex=False)

lst_col = 'Context.Span'
# Named `pred_spans` rather than `x` so the feature-name list of the bar-plot cell
# is not overwritten.
pred_spans = pred_data_spr.assign(**{lst_col: pred_data[lst_col].str.split(';')})
span_counts = pred_spans[lst_col].str.len()
pred_data_spr = pd.DataFrame(
    {col: np.repeat(pred_spans[col].values, span_counts) for col in pred_spans.columns.difference([lst_col])}
).assign(**{lst_col: np.concatenate(pred_spans[lst_col].values)})[pred_spans.columns.tolist()]
pred_data_spr['Unique.ID'] = pred_data_spr.apply(lambda row: _span_unique_id(row['Sentence.ID'], row['Context.Span']), axis=1)
pred_data_spr = pred_data_spr.dropna()
pred_data_spr = pred_data_spr.groupby('Unique.ID', as_index=True).mean(numeric_only=True)

print("\nPred\n")
print_spr_correlations(spr, properties, pred_data_spr, attributes_pred)

# Analysis in paper

In [None]:
# Number of decimals used by the score tables printed in the cells below.
sigdig = 1
arg_dev = pd.read_csv('dev_preds_arg.tsv', sep='\t')
pred_dev = pd.read_csv('dev_preds_pred.tsv', sep='\t')

# Columns of the dev-prediction files that hold the model predictions.
attributes_arg_dev = ['Is.Particular.Pred', 'Is.Kind.Pred', 'Is.Abstract.Pred']
attributes_pred_dev = ['Is.Particular.Pred', 'Is.Hypothetical.Pred', 'Is.Dynamic.Pred']


def _print_sample(sentences, n=5, seed=0):
    """Print up to `n` sentences drawn with a fixed seed.

    The fixed seed makes the cell reproducible, and capping `n` at the number of
    available rows avoids the ValueError that `sample` raises on a short series.
    """
    print(sentences.sample(min(n, len(sentences)), random_state=seed), "\n")


pron_df = arg_dev[arg_dev['Lemma'].isin(['you', 'they'])]
print("Pronominal sentences containing you/they with high Is.Kind values\n")
_print_sample(pron_df[pron_df['Is.Kind.Norm'] > 0]['Sentences'])

# Literal match on "if " (with the trailing space), the filter the printed sample uses.
hyp_df = pred_dev[pred_dev['Sentences'].str.contains('if ', regex=False)]
print("Conditional sentences(with if) with low Is.Hypothetical scores\n")
_print_sample(hyp_df[hyp_df['Is.Hypothetical.Pred'] < -0.3]['Sentences'])

## Thing words

In [None]:
mpl.rcParams.update({'font.size': 15})
# Dev arguments whose lemma contains "thing"; rows with a missing lemma are excluded.
thing_words = arg_dev[arg_dev['Lemma'].str.contains('thing', na=False)]

# Density curves of annotated (.Norm) and predicted (.Pred) scores for each property.
# sns.kdeplot replaces the deprecated sns.distplot(..., hist=False).
plt.figure()
for column, label in [
    ('Is.Particular.Norm', 'Part Ann'),
    ('Is.Particular.Pred', 'Part Pred'),
    ('Is.Kind.Norm', 'Kind Ann'),
    ('Is.Kind.Pred', 'Kind Pred'),
    ('Is.Abstract.Norm', 'Abs Ann'),
    ('Is.Abstract.Pred', 'Abs Pred'),
]:
    sns.kdeplot(thing_words[column].dropna(), label=label)
plt.xlabel('Normalized score')
# Explicit legend: recent seaborn does not draw one from `label` on its own.
plt.legend()
plt.show()
# plt.savefig('things.png', transparent=True)

## POS and DEPREL

In [None]:
def print_group_scores(df, group_col, attr_stems, sigdig):
    """Print group sizes, then one LaTeX table row of scores per group of `group_col`.

    Each row lists, for every attribute stem, the Pearson correlation and the R1
    score (both multiplied by 100) between "<stem>.Norm" and "<stem>.Pred", followed
    by the R1 score computed over all attributes jointly.

    Parameters
    ----------
    df : DataFrame containing `group_col` and the .Norm / .Pred columns.
    group_col : column whose distinct values define the groups (e.g. 'POS').
    attr_stems : list of column stems such as 'Is.Particular'.
    sigdig : number of decimals to round to.
    """
    norm_cols = [stem + '.Norm' for stem in attr_stems]
    pred_cols = [stem + '.Pred' for stem in attr_stems]

    # Sorted so the row order is the same on every run (set iteration order is not),
    # and without NaN, which would only ever select an empty group.
    groups = sorted(df[group_col].dropna().unique())
    pprint([(group, len(df[df[group_col] == group])) for group in groups])

    for group in groups:
        data_new = df[df[group_col] == group]
        # A correlation needs at least two rows; such groups are still listed in the
        # counts above but get no score row.
        if len(data_new) < 2:
            continue
        cells = [group]
        for norm_col, pred_col in zip(norm_cols, pred_cols):
            cells += ['&', np.round(pearsonr(data_new[norm_col], data_new[pred_col])[0] * 100, sigdig),
                      '&', np.round(r1_score(data_new[norm_col], data_new[pred_col]) * 100, sigdig)]
        cells += ['&', np.round(r1_score(data_new.loc[:, norm_cols].values, data_new.loc[:, pred_cols].values) * 100, sigdig)]
        print(*cells, "\\\\")


arg_attr_stems = ['Is.Particular', 'Is.Kind', 'Is.Abstract']
pred_attr_stems = ['Is.Particular', 'Is.Hypothetical', 'Is.Dynamic']

print("\nArg POS")
print_group_scores(arg_dev, 'POS', arg_attr_stems, sigdig)

print("\nArg DEPREL")
print_group_scores(arg_dev, 'DEPREL', arg_attr_stems, sigdig)


print("\nPred POS")
print_group_scores(pred_dev, 'POS', pred_attr_stems, sigdig)

print("\nPred DEPREL")
print_group_scores(pred_dev, 'DEPREL', pred_attr_stems, sigdig)


# for pos in list(set(arg_dev['POS'].tolist()).intersection(set(pred_dev['POS'].tolist()))):
#     data_new = arg_dev[arg_dev['POS'] == pos]
#     data_new2 = pred_dev[pred_dev['POS'] == pos]
#     print(pos, 
#           '&', np.round(pearsonr(data_new['Is.Kind.Norm'], data_new['Is.Kind.Pred'])[0] * 100, sigdig), 
#           '&', np.round(r1_score(data_new['Is.Kind.Norm'], data_new['Is.Kind.Pred']) * 100, sigdig), 
#           '&', np.round(pearsonr(data_new['Is.Abstract.Norm'], data_new['Is.Abstract.Pred'])[0] * 100, sigdig), 
#           '&', np.round(r1_score(data_new['Is.Abstract.Norm'], data_new['Is.Abstract.Pred']) * 100, sigdig), 
#           '&', np.round(pearsonr(data_new2['Is.Hypothetical.Norm'], data_new2['Is.Hypothetical.Pred'])[0] * 100, sigdig), 
#           '&', np.round(r1_score(data_new2['Is.Hypothetical.Norm'], data_new2['Is.Hypothetical.Pred']) * 100, sigdig), "\\\\")

# for deprel in list(set(arg_dev['POS'].tolist()).intersection(set(pred_dev['POS'].tolist()))):
#     data_new = arg_dev[arg_dev['DEPREL'] == deprel]
#     data_new2 = pred_dev[pred_dev['DEPREL'] == deprel]
#     print(deprel, 
#           '&', np.round(pearsonr(data_new['Is.Kind.Norm'], data_new['Is.Kind.Pred'])[0] * 100, sigdig), 
#           '&', np.round(r1_score(data_new['Is.Kind.Norm'], data_new['Is.Kind.Pred']) * 100, sigdig), 
#           '&', np.round(pearsonr(data_new['Is.Abstract.Norm'], data_new['Is.Abstract.Pred'])[0] * 100, sigdig), 
#           '&', np.round(r1_score(data_new['Is.Abstract.Norm'], data_new['Is.Abstract.Pred']) * 100, sigdig), 
#           '&', np.round(pearsonr(data_new2['Is.Hypothetical.Norm'], data_new2['Is.Hypothetical.Pred'])[0] * 100, sigdig), 
#           '&', np.round(r1_score(data_new2['Is.Hypothetical.Norm'], data_new2['Is.Hypothetical.Pred']) * 100, sigdig), "\\\\")



In [None]:
# Five random dev predicates annotated below -0.2 on both Particular and Hypothetical.
pred_dev.loc[
    (pred_dev['Is.Particular.Norm'] < -0.2) & (pred_dev['Is.Hypothetical.Norm'] < -0.2),
    ['Unique.ID', 'Sentences'],
].sample(5)

In [None]:
 
# Pearson correlation (x100) between Particular and Hypothetical on the dev set:
# annotation vs annotation, prediction vs prediction, then the two mixed pairings.
for particular_col, hypothetical_col in [
    ('Is.Particular.Norm', 'Is.Hypothetical.Norm'),
    ('Is.Particular.Pred', 'Is.Hypothetical.Pred'),
    ('Is.Particular.Norm', 'Is.Hypothetical.Pred'),
    ('Is.Particular.Pred', 'Is.Hypothetical.Norm'),
]:
    coefficient = pearsonr(pred_dev[particular_col], pred_dev[hypothetical_col])[0]
    print(np.round(coefficient * 100, sigdig))

In [None]:
# Module-level rounding precision; both helpers below read it when they are called.
sigdig = 3


def create_corr_df(df, attributes):
    """Return a DataFrame of pairwise Pearson correlations between `attributes`.

    Columns and rows both follow the order of `attributes`; every coefficient is
    rounded to `sigdig` decimals.
    """
    corr_table = {
        column: {
            row: np.round(pearsonr(df[column], df[row])[0], sigdig)
            for row in attributes
        }
        for column in attributes
    }
    return pd.DataFrame(corr_table)


def create_dist(df, attributes):
    """Return {attribute: {'mean': .., 'median': .., 'var': ..}} for `attributes`.

    Every statistic is computed with the matching numpy function and rounded to
    `sigdig` decimals.
    """
    statistics = (('mean', np.mean), ('median', np.median), ('var', np.var))
    return {
        attr: {name: np.round(compute(df[attr]), sigdig) for name, compute in statistics}
        for attr in attributes
    }

## Argument annotations

In [None]:
print(arg_data.shape)
# Correlations between, and summary statistics of, the annotated argument properties.
arg_norm_corr = create_corr_df(arg_data, attributes_arg)
display(pd.DataFrame(arg_norm_corr))
arg_norm_stats = create_dist(arg_data, attributes_arg)
display(pd.DataFrame(arg_norm_stats))

# One panel per property, all sharing the y axis.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=[20, 5], sharey='row', squeeze=False)
for panel, attr in zip(ax[0], attributes_arg):
    sns.distplot(arg_data[attr], ax=panel)
plt.show()


## Argument predictions

In [None]:
# Correlations between, and summary statistics of, the predicted argument properties.
arg_pred_corr = create_corr_df(arg_dev, attributes_arg_dev)
display(pd.DataFrame(arg_pred_corr))
arg_pred_stats = create_dist(arg_dev, attributes_arg_dev)
display(pd.DataFrame(arg_pred_stats))

# One panel per property, all sharing the y axis.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=[20, 5], sharey='row', squeeze=False)
for panel, attr in zip(ax[0], attributes_arg_dev):
    sns.distplot(arg_dev[attr], ax=panel)
plt.show()

## Predicate annotations

In [None]:
# Correlations between, and summary statistics of, the annotated predicate properties.
pred_norm_corr = create_corr_df(pred_data, attributes_pred)
display(pd.DataFrame(pred_norm_corr))
pred_norm_stats = create_dist(pred_data, attributes_pred)
display(pd.DataFrame(pred_norm_stats))

# One panel per property, all sharing the y axis.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=[20, 5], sharey='row', squeeze=False)
for panel, attr in zip(ax[0], attributes_pred):
    sns.distplot(pred_data[attr], ax=panel)
plt.show()

## Predicate predictions

In [None]:
# Correlations between, and summary statistics of, the predicted predicate properties.
pred_pred_corr = create_corr_df(pred_dev, attributes_pred_dev)
display(pd.DataFrame(pred_pred_corr))
pred_pred_stats = create_dist(pred_dev, attributes_pred_dev)
display(pd.DataFrame(pred_pred_stats))

# One panel per property, all sharing the y axis.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=[20, 5], sharey='row', squeeze=False)
for panel, attr in zip(ax[0], attributes_pred_dev):
    sns.distplot(pred_dev[attr], ax=panel)
plt.show()

In [None]:

# Dev arguments annotated below -0.2 on both Particular and Abstract, listed with
# annotations and predictions side by side, highest predictions first.
low_particular_and_abstract = (arg_dev['Is.Particular.Norm'] < -0.2) & (arg_dev['Is.Abstract.Norm'] < -0.2)
arg_columns_shown = [
    'Unique.ID', 'Sentences', 'Word',
    'Is.Particular.Norm', 'Is.Particular.Pred',
    'Is.Abstract.Norm', 'Is.Abstract.Pred',
    'Is.Kind.Norm', 'Is.Kind.Pred',
]
display(
    arg_dev.loc[low_particular_and_abstract, arg_columns_shown]
    .sort_values(by=['Is.Particular.Pred', 'Is.Abstract.Pred'], ascending=False)
)

In [None]:
# Annotation distributions restricted to dev arguments tagged PROPN.
upper = arg_dev[arg_dev['POS'] == 'PROPN']

# One panel per property, all sharing the y axis.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=[20, 5], sharey='row', squeeze=False)
for panel, attr in zip(ax[0], attributes_arg):
    sns.distplot(upper[attr], ax=panel)
plt.show()


In [None]:
# Show every row of the table below instead of pandas' truncated preview.
pd.set_option('display.max_rows', None)

# Dev predicates predicted above 0.1 on both Particular and Dynamic, highest first.
high_particular_and_dynamic = (pred_dev['Is.Particular.Pred'] > 0.1) & (pred_dev['Is.Dynamic.Pred'] > 0.1)
pred_columns_shown = [
    'Unique.ID', 'Sentences', 'Word',
    'Is.Particular.Norm', 'Is.Particular.Pred',
    'Is.Hypothetical.Norm', 'Is.Hypothetical.Pred',
    'Is.Dynamic.Norm', 'Is.Dynamic.Pred',
]
display(
    pred_dev.loc[high_particular_and_dynamic, pred_columns_shown]
    .sort_values(by=['Is.Particular.Pred', 'Is.Dynamic.Pred'], ascending=False)
)

## Clausal versus other DEPREL

In [None]:
# Dependency relations treated as clausal, and the remaining relations compared against them.
clause_deprel = ['csubj', 'ccomp', 'xcomp', 'advcl', 'acl']
other_deprel = ['root', 'conj', 'parataxis']

# Number of dev predicates that fall into each relation set.
pprint([(relations, len(pred_dev[pred_dev['DEPREL'].isin(relations)])) for relations in (clause_deprel, other_deprel)])

norm_columns = ['Is.Particular.Norm', 'Is.Hypothetical.Norm', 'Is.Dynamic.Norm']
pred_columns = ['Is.Particular.Pred', 'Is.Hypothetical.Pred', 'Is.Dynamic.Pred']

# One LaTeX row per relation set: Pearson and R1 (x100) per property, then the joint R1.
for deprel_set in (clause_deprel, other_deprel):
    data_new = pred_dev[pred_dev['DEPREL'].isin(deprel_set)]
    row_cells = []
    for norm_col, pred_col in zip(norm_columns, pred_columns):
        row_cells.append('&')
        row_cells.append(np.round(pearsonr(data_new[norm_col], data_new[pred_col])[0] * 100, sigdig))
        row_cells.append('&')
        row_cells.append(np.round(r1_score(data_new[norm_col], data_new[pred_col]) * 100, sigdig))
    row_cells.append('&')
    row_cells.append(np.round(r1_score(data_new.loc[:, norm_columns].values, data_new.loc[:, pred_columns].values) * 100, sigdig))
    print(*row_cells, "\\\\")

# Reference values for Particular over the whole dev set.
overall_pearson = np.round(pearsonr(pred_dev['Is.Particular.Norm'], pred_dev['Is.Particular.Pred'])[0] * 100, sigdig)
overall_r1 = np.round(r1_score(pred_dev['Is.Particular.Norm'], pred_dev['Is.Particular.Pred']) * 100, sigdig)
print(overall_pearson, '&', overall_r1)