In [1]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr, pearsonr
from os.path import expanduser
from sklearn import mixture
import itertools
from scipy import linalg
import matplotlib as mpl
mpl.use('agg')
import matplotlib.pyplot as plt
from collections import defaultdict
import seaborn as sns
from pprint import pprint
from factslab.utility.lcsreader import LexicalConceptualStructureLexicon
from factslab.utility import ridit, r1_score, dev_mode_group

home = expanduser('~')
%matplotlib inline
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.max_colwidth', -1)

# Load the data

In [2]:
attributes_pred = ['Is.Particular.Norm', 'Is.Hypothetical.Norm', 'Is.Dynamic.Norm']
pred_datafile = home + "/Research/protocols/data/pred_raw_data_norm_122218.tsv"
pred_data = pd.read_csv(pred_datafile, sep="\t")
pred_data = pred_data[pred_data['Split'].isin(['train', 'dev'])]
pred_data_unq = pred_data.groupby('Unique.ID', as_index=False).apply(lambda x: dev_mode_group(x, attributes_pred, type="regression")).reset_index(drop=True)
print("Predicate train + dev data unique shape", pred_data.shape)

attributes_arg = ['Is.Particular.Norm', 'Is.Kind.Norm', 'Is.Abstract.Norm']
arg_datafile = home + "/Research/protocols/data/noun_raw_data_norm_122218.tsv"
arg_data = pd.read_csv(arg_datafile, sep="\t")
arg_data = arg_data[arg_data['Split'].isin(['train', 'dev'])]
arg_data_unq = arg_data.groupby('Unique.ID', as_index=False).apply(lambda x: dev_mode_group(x, attributes_arg, type="regression")).reset_index(drop=True)
print("Argument train + dev data unique shape", arg_data.shape)

Predicate train + dev data unique shape (36549, 29)
Argument train + dev data unique shape (40883, 27)


# Concreteness correlations

In [3]:
conc_data = arg_data_unq.copy()

path = home + "/Research/protocols/data/concreteness.tsv"
conc = pd.read_csv(path, sep="\t")
list_of_lemmas = conc['Word'].values.tolist()

abs_conc = conc_data.groupby('Lemma')['Is.Abstract.Norm'].mean().to_frame().reset_index()
abs_conc['conc'] = abs_conc['Lemma'].map(lambda x: (conc[conc['Word'] == x.lower()]['Conc.M']).values[0] if x.lower() in list_of_lemmas else -1)

ini = len(abs_conc)
abs_conc = abs_conc[abs_conc['conc'] != -1]
print("Percentage of lemmas found in database:", len(abs_conc) / ini)
print("Spearman correlation: ", np.round(spearmanr(abs_conc['Is.Abstract.Norm'].values, abs_conc['conc'].values)[0], 2))
print("Pearson correlation: ", np.round(pearsonr(abs_conc['Is.Abstract.Norm'].values, abs_conc['conc'].values)[0], 2))

Percentage of lemmas found in database: 0.6633102580239144
Spearman correlation:  -0.45
Pearson correlation:  -0.45


# LCS

In [4]:
path = home + "/Research/protocols/data/verbs-English.lcs"
lcs = LexicalConceptualStructureLexicon(path)

lcs_data = pred_data_unq.copy()

dyn_lcs = lcs_data.groupby('Lemma')['Is.Dynamic.Norm'].apply(list).to_frame().reset_index()
dyn_lcs['lcs'] = dyn_lcs['Lemma'].map(lambda x: lcs.eventive(x.lower()) if x.lower() in lcs.verbs else -1)
num_of_lemmas = len(dyn_lcs)
dyn_lcs = dyn_lcs[dyn_lcs['lcs'] != -1]
dyn_lcs.set_index('Lemma', inplace=True)
dyn_lcs['dyn'] = dyn_lcs['Is.Dynamic.Norm'].apply(lambda x: [a > 0 for a in x])
dyn_lcs['comp'] = dyn_lcs.apply(lambda x: 1 if set(x['dyn']).intersection(set(x['lcs'])) else 0, axis=1)

print("Percentage of lemmas found in lcs database:", len(dyn_lcs) / num_of_lemmas)
print("They share at least one sense:", sum(dyn_lcs['comp']) / len(dyn_lcs))

Percentage of lemmas found in lcs database: 0.43727794479365945
They share at least one sense: 0.861875


In [9]:
dyn_lcs['comp'] = dyn_lcs.apply(lambda x: 1 if set(x['dyn']).issubset(set(x['lcs'])) else 0, axis=1)

print("Percentage of lemmas found in lcs database:", len(dyn_lcs) / num_of_lemmas)
print("They share at least one sense:", sum(dyn_lcs['comp']) / len(dyn_lcs))

Percentage of lemmas found in lcs database: 0.43727794479365945
They share at least one sense: 0.396875


In [15]:
from sklearn.metrics import matthews_corrcoef

dyn_lcs['lcs_event'] = dyn_lcs['lcs'].map(lambda x: True in x)
dyn_lcs['udsg_event'] = dyn_lcs['dyn'].map(lambda x: True in x)

dyn_lcs['lcs_state'] = dyn_lcs['lcs'].map(lambda x: False in x)
dyn_lcs['udsg_state'] = dyn_lcs['dyn'].map(lambda x: False in x)

(dyn_lcs['lcs_event']==dyn_lcs['udsg_event']).sum()/num_of_lemmas, (dyn_lcs['lcs_state']==dyn_lcs['udsg_state']).sum()/num_of_lemmas


(0.35528833014484834, 0.1779174637879202)

In [27]:
display(dyn_lcs[(dyn_lcs['lcs_event']==True) & (dyn_lcs['udsg_event']==True) & (dyn_lcs['lcs_state']==True) & (dyn_lcs['udsg_state']==True)])

Unnamed: 0_level_0,Is.Dynamic.Norm,lcs,dyn,comp,lcs_event,udsg_event,lcs_state,udsg_state
Lemma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
accept,"[0.14989181921041672, -0.09531629069819296, 0.7661451062858519, 1.063444222719334, 0.8100125695250876, 1.145753525941472, 0.6436138864523061, 0.9405894815742868, -0.3704511348881617, 0.5246988627844292, -0.5795089035135333, -0.3629648163281317, 1.1296925552522403, 1.0979930832358349, -1.0731368082336958, 0.9047283825552058, 1.5261923537454258, 1.7603946016207612, 1.7090099423734073, -1.836711070486146, 1.263555611620163, 1.3571135258582383, 0.6351565245211184, 1.5250541027391542, 1.405233602749987, 1.0130610154618749, 1.4044028822110397, -0.8415081924030048, -0.6571209636317346]","[True, False, False, True]","[True, False, True, True, True, True, True, True, False, True, False, False, True, True, False, True, True, True, True, False, True, True, True, True, True, True, True, False, False]",1,True,True,True,True
acknowledge,"[0.5685117846287859, 0.7970149410374195, -0.9183331645834192, 0.3351680313823737, 0.6583967746634813]","[False, False, False, True, True]","[True, True, False, True, True]",1,True,True,True,True
admit,"[0.2367839177782213, 0.5178274273559303, 2.0840013215706787, -0.8443952770479536, 0.4703849287920591, -0.6928995481420592, -1.2672012074092045, 0.3501813483276576, 0.7655263789399682, 1.1079410530035532, 1.003463080618795, 1.2877816141868408, -0.9418973373401828, 0.4220795785332815, 2.078592380257972, 0.7975638122168168, -1.1730562154056998]","[False, True, True]","[True, True, True, False, True, False, False, True, True, True, True, True, False, True, True, True, False]",1,True,True,True,True
adopt,"[-1.4239377945606122, 0.5259356889061672, 0.29163734842979844, 0.5231286077271492, 0.7135634623930612, 0.9379730143548628]","[False, True, True, True]","[False, True, True, True, True, True]",1,True,True,True,True
allow,"[-0.292829964696834, -0.5365488473855263, 0.5825392226546018, -0.06807342143866632, 0.4727884864715322, 0.4129780467456997, 1.2759180733332094, 0.5572130450728608, 0.603186672541811, -0.8029801917744894, -0.8624150460128447, 1.5879970157590226, 0.22068684532799154, -0.0966444758464715, -0.7060602565112364, -1.3507691123627048, 1.0608196094219522, 0.6256911639254159, -0.7767394869422323, -0.9539147125068416, 0.0007798419065501022, -0.272964235435603, 0.7093711273654915, 0.3531318848450519, -1.0210627276727402, 0.4475700833764712, -1.2891623922235458, 0.4883841700830048, 0.98354643130078, -0.9963194597438664, -0.7374146908924728, 1.89027125883956, 0.3472784596513207, 0.32122757306845634, 0.3730888625702919, 0.499384597183289]","[False, True, True]","[False, False, True, False, True, True, True, True, True, False, False, True, True, False, False, False, True, True, False, False, True, False, True, True, False, True, False, True, True, False, False, True, True, True, True, True]",1,True,True,True,True
appear,"[0.16211556429899127, -1.751232422848522, 0.9010501625917232, -0.6584144239756262, 0.09076178362630903, 1.4922945725993708, -0.2816057019573329, -0.3880806204051061, -1.2730236615098125, 0.7806023453228644, -0.20286854090923745, -0.8762667100305649, -1.0523021116804858, -0.2386708427757693, 0.7987534167294745, 1.0652494016285965, -0.953842746589596, -1.0089808674388876, 0.7165705442045796, -0.3139252257695089, 0.4910403711450248, -0.9717056922380832]","[True, False, True]","[True, False, True, False, True, True, False, False, False, True, False, False, False, False, True, True, False, False, True, False, True, False]",1,True,True,True,True
appreciate,"[0.7272724712051266, -0.4584195215961501, -0.3738782099342359, 0.4445408990451611, -0.3891085163315997, 0.014023341001231496, -0.5334565518089514, 0.44803575730328543, 0.6198542423754453, -0.8904347400696773, 0.7267404165102818, 0.9362147980691856, 1.5156698532621509, -0.7397698775707721, -0.28765211052542944, 0.0455738396773324, 1.7329810353575912, -0.8970325096489405, -0.36683541227197897, -1.0611382945438042, -0.14151849462810873, -2.3250099815170544, 1.7872993841907314, -0.20194847026127227, 0.6673447190107226, -1.361097457391307, 0.3434186189543707, 1.025369817637661, -1.1943470177148687, -0.8626765939033396, 1.5590049075875656, 0.5364548126074015, -0.3219753660406113, -1.274752797122665]","[False, False, True, False, False, False, False, True]","[True, False, False, True, False, True, False, True, True, False, True, True, True, False, False, True, True, False, False, False, False, False, True, False, True, False, True, True, False, False, True, True, False, False]",1,True,True,True,True
be,"[-0.6166212008984907, -1.5975218129150714, 0.2222149126718259, -0.4018428512264847, -1.4895240250654933, -1.7856087092294601, -1.95999517012534, 0.07801257048727718, -0.7778475934661065, -0.035400134638060105, -1.2536219716253418, -0.8844044518276154, -0.622430219750184, -0.5706605481256124, -0.6717469126475287, -0.3726497706933107, -0.8486369958638149, -0.3287544514263096, -0.7431773834808894, -1.6193509453057289, -1.5871125792595298, -1.5923680649951633, -0.9322686037285338, -0.9768449001750134, -0.3466861558279253, 0.5432643851565977, -0.08085218264974277, 0.3931506426476334, -0.5946565385967234, -1.842921825572602, 0.1420755684833636, -0.1401767033787178, -1.0870441923356349, -1.0215400799809329, 0.27866402297056864, -1.4955108019327081, -2.008217530816244, -2.0877190084984205, -0.8224979220170084, -0.3838746206697092, -1.2723695088900735, -1.2792170283567768, -1.1186986864209565, -1.1520073161226945, -0.6181169129560136, 0.7950109428142943, -1.10046195856722, -0.13266065394855692, -0.5789801035690282, -0.5164657538296525, -0.4649458754672155, -0.006673023068057504, -0.8312386645283792, -0.6479011414349387, -0.6778151520627111, 0.1816701716611897, -0.07010248117482844, -1.4019265262617502, -0.016738005291403638, -0.8191996619112462, -0.2598158460766697, 0.15427105222086235, -1.5370679556573774, -0.7887271439551004, -0.601144247312796, -0.049515990352782015, -1.315916222282366, -0.9175421546926996, 2.9584399501124303, 0.4836935421861643, -0.9740136836175468, -0.672566602718402, -1.3877002575882698, -0.04757935832138025, 0.2919052520341213, -0.9409660777082959, -1.0557612355451822, 1.4999908470754562, -0.08615579706339495, -0.8890081013157319, 0.8680954866668366, -0.4760732919814002, -1.0054722206078606, 0.5054649277717544, -1.9323668446616038, -1.7609734712139622, -1.0176307742710569, -1.5844826506987042, -0.8001857102450827, -0.3872233927142886, -0.3922890387665808, 0.2532315762944141, -0.5607843834251327, 0.10612740832014568, 0.7462650556240957, -0.25917044578231785, -0.8774647303309523, -0.9116199561393611, 0.1747018274779164, -0.5178084589600574, ...]","[False, False, False, True, False, False, False, False, False, False, False, False]","[False, False, True, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, True, True, False, False, False, False, True, False, False, True, False, False, True, False, False, True, False, False, False, False, False, False, False, True, False, True, True, False, False, False, True, False, ...]",1,True,True,True,True
bear,"[-0.3314271102155025, -0.3859428028750277, -1.029840287214616, 0.4930374803573547, 0.3026868945346603, -0.4234155464133128, 1.05204128254713, 1.3040760493364667, 0.09805998238518583, 1.1885085221635248, -1.521479108095055, 1.8020455193041176, -0.4207860297657036, 0.5483790962739168, 1.5100379076801262, 1.427702853302369, 1.5777052366207966, 0.8821931977336227, 0.8172050593061021, 1.2219559723705116, 1.2225938841387574, -0.23123655595610956, 1.6717836808002688, -1.831437412641321, 1.1376664830011187, 1.1383581141824854]","[True, False, False]","[False, False, False, True, True, False, True, True, True, True, False, True, False, True, True, True, True, True, True, True, True, False, True, False, True, True]",1,True,True,True,True
believe,"[0.5189921941673827, -0.12552532282434034, -0.6380872208805485, -0.1835777932132696, 1.44811986191206, 1.2346072877436882, 0.4627170471143787, -1.283082585264431, 0.8994529271242384, 0.5891456910591869, -1.1386541286751741, 0.4116298588432151, 1.128624555074437, -0.19784813198249654, 0.8162824881477362, -0.4964881973044021, 0.7037681459615298, 1.3181659467982965, -0.5730322441203839, 1.0992149244642808, 1.3085251710177752, -0.3545793440615562, -1.2977351194016409, -0.7501832460154315, -1.083029347155212, -1.3383873009009422, -1.986748537822827, -0.11270861586765699, -0.5311891378231997, -0.3327421710325271, -1.05568619163268, -0.2816751111323319, -1.072503726007676, -1.6021548598868007, 1.0631980752579997, -1.5384973698260906, 1.0016535951222207, -1.209041096372325, -1.0216051274132854, -0.03138107228530557, 1.2159018158897623, -0.4987220971886794, -1.413767046527094, -1.0282601477239313, 0.3626425785081828, -1.0478022862390741, -1.087799166656509, -0.5201761047865879, 0.5213053821445248, -1.0756145262610228, -2.384154355201056, -1.0583370196967865, -0.26504676846653685, 1.6302897954145297, -0.3207464416904585, -0.17312076834949636, -1.2391050478931451, -0.3642629488360235, -1.038457795336559, 1.431051923417863, -0.9086785430975266, -1.5827172673957468, 0.7378802442057735, 0.2681166810865908, -0.7153863641030576, -1.2518273290288962, -0.9251326197720464, -1.355059287089954, 0.30517902934462265, -0.8279478294498233, 0.21710219636995493, 1.6657209340223649, -0.9635077821045996, -1.2085267714489571, 0.31239957726950845, 1.6534982620297312, -1.3451386864518895, -1.2068422823470664, 0.98192129836265, 1.6338961679064408, 1.218245470680711, 1.1610949759304947, 1.0283440961374586, -0.25566094161352665, 0.7147330670797345, -0.922554348996744, -1.1060746519375504, -0.28638892508083835, -0.6239088494083905, -0.8060424520094973, -1.5526463196164613, -1.1737515978975526]","[False, False, False, False, True, True]","[True, False, False, False, True, True, True, False, True, True, False, True, True, False, True, False, True, True, False, True, True, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False, False, True, False, False, False, True, False, False, False, True, False, False, False, False, True, False, False, False, False, False, True, False, False, True, True, False, False, False, False, True, False, True, True, False, False, True, True, False, False, True, True, True, True, True, False, True, False, False, False, False, False, False, False]",1,True,True,True,True


# Factuality

In [None]:
datafile_ = home + "/Research/protocols/data/it-happened_eng_ud1.2_07092017_normalized.tsv"
fact_data = pd.read_csv(datafile_, sep="\t")

pred_data_f = pred_data_unq.copy()
pred_data_f['Sentence.ID'] = pred_data_f['Sentence.ID'].str.replace('sent_', '', regex=False)
pred_data_f['Unique.ID'] = pred_data_f.apply(lambda x: str(x['Sentence.ID']) + "_" + str(x["Root.Token"]), axis=1)
pred_data_f = pred_data_f.groupby('Unique.ID', as_index=False).apply(lambda x: dev_mode_group(x, attributes_pred, type="regression")).reset_index(drop=True)

fact_data = fact_data[fact_data['Split'].isin(['train', 'dev'])]
fact_data['Unique.ID'] = fact_data.apply(lambda x: str(x['Sentence.ID']) + "_" + str(x["Pred.Token"] - 1), axis=1)
fact_data = fact_data.groupby('Unique.ID', as_index=False).mean().reset_index(drop=True)

hyp_fact = pred_data_f.loc[:, ['Unique.ID', 'Is.Hypothetical.Norm', 'Is.Particular.Norm', 'Is.Dynamic.Norm']]
fact_ids = fact_data['Unique.ID'].tolist()
hyp_fact['Happened.Norm'] = hyp_fact['Unique.ID'].apply(lambda x: fact_data[fact_data['Unique.ID'] == x]['Happened.Norm'].iloc[0] if x in fact_ids else None)
hyp_fact2 = hyp_fact.dropna()
print("Overlap percentage", np.round(len(hyp_fact2) / len(hyp_fact), 2))
# asdf = hyp_fact2[(hyp_fact2['Is.Hypothetical.Norm']>1) & (hyp_fact2['Happened.Norm']<-1)]
for attr in ['Is.Hypothetical.Norm', 'Is.Particular.Norm', 'Is.Dynamic.Norm']:
    print(attr)
    print("Spearman correlation: ", np.round(spearmanr(hyp_fact2[attr].values, hyp_fact2['Happened.Norm'].values)[0], 2))
    print("Pearson correlation: ", np.round(pearsonr(hyp_fact2[attr].values, hyp_fact2['Happened.Norm'].values)[0], 2))

# Plot Bars

In [None]:
x = ['Token', 'Type', 'GloVe', 'ELMO', 'Hand', 'All']
ind = np.arange(len(x))
y_part = [39.2, 51.6, 44.8, 58.2, 54.8, 58.0]
y_kind = [26.0, 42.0, 33.1, 47.9, 45.6, 48.0]
y_abs = [49.2, 34.4, 46.9, 55.8, 51.6, 56.2]

plt.figure()
plt.suptitle('Argument correlation scores')
ax = plt.subplot(111)
# ax.set_xlabel('Model features')
ax.set_ylabel('Pearson correlation')
ax.bar(ind - 0.2, y_part, width=0.2, color='b', align='center', label='Particular')
ax.bar(x, y_kind, width=0.2, color='g', align='center', label='Kind')
ax.bar(ind + 0.2, y_abs, width=0.2, color='r', align='center', label='Abstract')
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

# Put a legend to the right of the current axis
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()
# plt.savefig('bars-arg-pear.png')


x = ['Token', 'Type', 'GloVe', 'ELMO', 'Hand', 'All']
ind = np.arange(len(x))
y_part = [16.3, 20.9, 20.3, 27.5, 24.7, 28.5]
y_hyp = [13.8, 38.3, 22.9, 42.2, 38.8, 42.0]
y_dyn = [33.2, 31.5, 29.4, 38.3, 37.5, 38.8]

plt.figure()
plt.suptitle('Predicate correlation scores')
ax = plt.subplot(111)
# ax.set_xlabel('Model features')
ax.set_ylabel('Pearson correlation')
ax.bar(ind - 0.2, y_part, width=0.2, color='b', align='center', label='Particular')
ax.bar(x, y_hyp, width=0.2, color='g', align='center', label='Hypothetical')
ax.bar(ind + 0.2, y_dyn, width=0.2, color='r', align='center', label='Dynamic')
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

# Put a legend to the right of the current axis
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()
# plt.savefig('bars-pred-pear.png')


x = ['Token', 'Type', 'GloVe', 'ELMO', 'Hand', 'All']
ind = np.arange(len(x))
y_part = [7.9, 13.1, 9.7, 15.7, 13.5, 15.8]
y_kind = [2.1, 8.9, 2.6, 11.6, 10.8, 11.4]
y_abs = [14.1, 6.3, 11.9, 17.3, 15.2, 17.7]

plt.figure()
plt.suptitle('Argument R1 scores')
ax = plt.subplot(111)
ax.set_xlabel('Model features')
ax.set_ylabel('R1')
ax.bar(ind - 0.2, y_part, width=0.2, color='b', align='center', label='Particular')
ax.bar(x, y_kind, width=0.2, color='g', align='center', label='Kind')
ax.bar(ind + 0.2, y_abs, width=0.2, color='r', align='center', label='Abstract')
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

# Put a legend to the right of the current axis
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()
# plt.savefig('bars-arg-r1.png')


x = ['Token', 'Type', 'GloVe', 'ELMO', 'Hand', 'All']
ind = np.arange(len(x))
y_part = [1.2, 2.7, 2.2, 3.9, 3.4, 4.5]
y_hyp = [0, 3.7, 0, 6.0, 2.9, 7.2]
y_dyn = [6.0, 5.4, 4.6, 7.8, 7.9, 8.4]

plt.figure()
plt.suptitle('Predicate R1 scores')
ax = plt.subplot(111)
# ax.set_xlabel('Model features')
ax.set_ylabel('R1')
ax.bar(ind - 0.2, y_part, width=0.2, color='b', align='center', label='Particular')
ax.bar(x, y_hyp, width=0.2, color='g', align='center', label='Hypothetical')
ax.bar(ind + 0.2, y_dyn, width=0.2, color='r', align='center', label='Dynamic')
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

# Put a legend to the right of the current axis
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()
# plt.savefig('bars-pred-r1.png')

# SPR

In [None]:
arg_data_spr = arg_data_unq.copy()
arg_data_spr['Sentence.ID'] = arg_data_spr['Sentence.ID'].str.replace('sent_', '', regex=False)
arg_data_spr['Unique.ID'] = arg_data_spr.apply(lambda x: str(x['Sentence.ID']) + "_" + str(x['Span']).split(',')[0] + "_" + str(x['Span']).split(',')[-1], axis=1)
arg_data_spr = arg_data_spr.dropna()
arg_data_spr = arg_data_spr.groupby('Unique.ID', as_index=True).mean()

datafile_ = home + "/Research/protocols/data/spr/protoroles_eng_ud1.2_11082016.tsv"
spr = pd.read_csv(datafile_, sep="\t")
# pred_data token is 0 indexed in SPR
spr['Unique.ID'] = spr.apply(lambda x: str(x['Sentence.ID']) + "_" + str(x["Arg.Tokens.Begin"]) + "_" + str(x["Arg.Tokens.End"]), axis=1)
spr = spr[~spr['Is.Pilot']]
spr = spr.dropna()

spr = spr[spr['Split'].isin(['train', 'dev'])]

properties = ['awareness', 'volition', 'sentient', 'instigation', 'existed_before', 'existed_during', 'existed_after', 'was_for_benefit', 'change_of_location', 'change_of_state', 'was_used', 'change_of_possession', 'partitive']

print("Arg\n")
for prop in properties:
    prop_df = spr[spr['Property'] == prop]
    prop_df.loc[:, 'Response.ridit'] = prop_df.groupby('Annotator.ID')['Response'].transform(ridit)
    prop_df = prop_df.groupby('Unique.ID', as_index=False).mean().reset_index(drop=True)
    prop_df = prop_df.loc[:, ['Unique.ID', 'Response.ridit']].dropna()

    for attr in attributes_arg:
        prop_df.loc[:, attr] = prop_df['Unique.ID'].apply(lambda x: arg_data_spr.loc[x][attr] if x in arg_data_spr.index else None)

    prop_df = prop_df.dropna()
    print(prop.replace('_', ' '), 
          '&', np.round(spearmanr(prop_df[attributes_arg[0]].values, prop_df['Response.ridit'].values)[0], 2), 
          '&', np.round(spearmanr(prop_df[attributes_arg[1]].values, prop_df['Response.ridit'].values)[0], 2), 
          '&', np.round(spearmanr(prop_df[attributes_arg[2]].values, prop_df['Response.ridit'].values)[0], 2), "\\\\")


pred_data_spr = pred_data_unq.copy()
pred_data_spr['Sentence.ID'] = pred_data['Sentence.ID'].str.replace('sent_', '', regex=False)

lst_col = 'Context.Span'
x = pred_data_spr.assign(**{lst_col: pred_data_spr[lst_col].str.split(';')})
pred_data_spr = pd.DataFrame({col: np.repeat(x[col].values, x[lst_col].str.len()) for col in x.columns.difference([lst_col])}).assign(**{lst_col: np.concatenate(x[lst_col].values)})[x.columns.tolist()]
pred_data_spr['Unique.ID'] = pred_data_spr.apply(lambda x: str(x['Sentence.ID']) + "_" + str(x['Context.Span']).split(',')[0] + "_" + str(x['Context.Span']).split(',')[-1], axis=1)
pred_data_spr = pred_data_spr.dropna()
pred_data_spr = pred_data_spr.groupby('Unique.ID', as_index=True).mean()

print("\nPred\n")
for prop in properties:
    prop_df = spr[spr['Property'] == prop]
    prop_df.loc[:, 'Response.ridit'] = prop_df.groupby('Annotator.ID')['Response'].transform(ridit)
    prop_df = prop_df.groupby('Unique.ID', as_index=False).mean().reset_index(drop=True)
    prop_df = prop_df.loc[:, ['Unique.ID', 'Response.ridit']].dropna()
    for attr in attributes_pred:
        prop_df.loc[:, attr] = prop_df['Unique.ID'].apply(lambda x: pred_data_spr.loc[x][attr] if x in pred_data_spr.index else None)
    prop_df = prop_df.dropna()

    print(prop.replace('_', ' '), 
          '&', np.round(spearmanr(prop_df[attributes_pred[0]].values, prop_df['Response.ridit'])[0], 2), 
          '&', np.round(spearmanr(prop_df[attributes_pred[1]].values, prop_df['Response.ridit'])[0], 2), 
          '&', np.round(spearmanr(prop_df[attributes_pred[2]].values, prop_df['Response.ridit'])[0], 2), "\\\\")

# Analysis in paper

In [None]:
sigdig = 1
arg_dev = pd.read_csv('dev_preds_arg.tsv', sep='\t')
pred_dev = pd.read_csv('dev_preds_pred.tsv', sep='\t')

attributes_arg_dev = ['Is.Particular.Pred', 'Is.Kind.Pred', 'Is.Abstract.Pred']
attributes_pred_dev = ['Is.Particular.Pred', 'Is.Hypothetical.Pred', 'Is.Dynamic.Pred']

pron_df = arg_dev[arg_dev['Lemma'].isin(['you', 'they'])]
print("Pronomial sentences containing you/they with high Is.Kind values\n")
print(pron_df[pron_df['Is.Kind.Norm'] > 0]['Sentences'].sample(5), "\n")

hyp_df = pred_dev[(pred_dev['Sentences'].str.contains('if'))]
print("Conditional sentences(with if) with low Is.Hypothetical scores\n")
print(pred_dev[(pred_dev['Sentences'].str.contains('if ', regex=False)) & 
               (pred_dev['Is.Hypothetical.Pred'] < -0.3)]['Sentences'].sample(5), "\n")

## Thing words

In [None]:
mpl.rcParams.update({'font.size': 15})
thing_words = arg_dev[arg_dev['Lemma'].str.contains('thing')]

plt.figure()
sns.distplot(thing_words['Is.Particular.Norm'], hist=False, label='Part Ann').get_figure()
sns.distplot(thing_words['Is.Particular.Pred'], hist=False, label='Part Pred').get_figure()
sns.distplot(thing_words['Is.Kind.Norm'], hist=False, label='Kind Ann').get_figure()
sns.distplot(thing_words['Is.Kind.Pred'], hist=False, label='Kind Pred').get_figure()
sns.distplot(thing_words['Is.Abstract.Norm'], hist=False, label='Abs Ann').get_figure()
sns.distplot(thing_words['Is.Abstract.Pred'], hist=False, label='Abs Pred').get_figure()
plt.xlabel('Normalized score')
plt.show()
# plt.savefig('things.png', transparent=True)

## POS and DEPREL

In [None]:
print("\nArg POS")
pprint([(a, len(arg_dev[arg_dev['POS'] == a])) for a in list(set(arg_dev['POS'].tolist()))])
for pos in list(set(arg_dev['POS'].tolist())):
    data_new = arg_dev[arg_dev['POS'] == pos]
    print(pos, '&', np.round(pearsonr(data_new['Is.Particular.Norm'], data_new['Is.Particular.Pred'])[0] * 100, sigdig), 
          '&', np.round(r1_score(data_new['Is.Particular.Norm'], data_new['Is.Particular.Pred']) * 100, sigdig), 
          '&', np.round(pearsonr(data_new['Is.Kind.Norm'], data_new['Is.Kind.Pred'])[0] * 100, sigdig), 
          '&', np.round(r1_score(data_new['Is.Kind.Norm'], data_new['Is.Kind.Pred']) * 100, sigdig),
          '&', np.round(pearsonr(data_new['Is.Abstract.Norm'], data_new['Is.Abstract.Pred'])[0] * 100, sigdig), 
          '&', np.round(r1_score(data_new['Is.Abstract.Norm'], data_new['Is.Abstract.Pred']) * 100, sigdig),
          '&', np.round(r1_score(data_new.loc[:, ['Is.Particular.Norm', 'Is.Kind.Norm', 'Is.Abstract.Norm']].values, data_new.loc[:, ['Is.Particular.Pred', 'Is.Kind.Pred', 'Is.Abstract.Pred']].values) * 100, sigdig), "\\\\")

print("\nArg DEPREL")
pprint([(a, len(arg_dev[arg_dev['DEPREL'] == a])) for a in list(set(arg_dev['DEPREL'].tolist()))])
for deprel in list(set(arg_dev['DEPREL'].tolist())):
    data_new = arg_dev[arg_dev['DEPREL'] == deprel]
    print(deprel, '&', np.round(pearsonr(data_new['Is.Particular.Norm'], data_new['Is.Particular.Pred'])[0] * 100, sigdig),
          '&', np.round(r1_score(data_new['Is.Particular.Norm'], data_new['Is.Particular.Pred']) * 100, sigdig), 
          '&', np.round(pearsonr(data_new['Is.Kind.Norm'], data_new['Is.Kind.Pred'])[0] * 100, sigdig), 
          '&', np.round(r1_score(data_new['Is.Kind.Norm'], data_new['Is.Kind.Pred']) * 100, sigdig), 
          '&', np.round(pearsonr(data_new['Is.Abstract.Norm'], data_new['Is.Abstract.Pred'])[0] * 100, sigdig), 
          '&', np.round(r1_score(data_new['Is.Abstract.Norm'], data_new['Is.Abstract.Pred']) * 100, sigdig), 
          '&', np.round(r1_score(data_new.loc[:, ['Is.Particular.Norm', 'Is.Kind.Norm', 'Is.Abstract.Norm']].values, data_new.loc[:, ['Is.Particular.Pred', 'Is.Kind.Pred', 'Is.Abstract.Pred']].values) * 100, sigdig), "\\\\")


print("\nPred POS")
pprint([(a, len(pred_dev[pred_dev['POS'] == a])) for a in list(set(pred_dev['POS'].tolist()))])
for pos in list(set(pred_dev['POS'].tolist())):
    data_new = pred_dev[pred_dev['POS'] == pos]
    print(pos, '&', np.round(pearsonr(data_new['Is.Particular.Norm'], data_new['Is.Particular.Pred'])[0] * 100, sigdig), 
          '&', np.round(r1_score(data_new['Is.Particular.Norm'], data_new['Is.Particular.Pred']) * 100, sigdig),
          '&', np.round(pearsonr(data_new['Is.Hypothetical.Norm'], data_new['Is.Hypothetical.Pred'])[0] * 100, sigdig), 
          '&', np.round(r1_score(data_new['Is.Hypothetical.Norm'], data_new['Is.Hypothetical.Pred']) * 100, sigdig), 
          '&', np.round(pearsonr(data_new['Is.Dynamic.Norm'], data_new['Is.Dynamic.Pred'])[0] * 100, sigdig), 
          '&', np.round(r1_score(data_new['Is.Dynamic.Norm'], data_new['Is.Dynamic.Pred']) * 100, sigdig), 
          '&', np.round(r1_score(data_new.loc[:, ['Is.Particular.Norm', 'Is.Hypothetical.Norm', 'Is.Dynamic.Norm']].values, data_new.loc[:, ['Is.Particular.Pred', 'Is.Hypothetical.Pred', 'Is.Dynamic.Pred']].values) * 100, sigdig), "\\\\")

print("\nPred DEPREL")
pprint([(a, len(pred_dev[pred_dev['DEPREL'] == a])) for a in list(set(pred_dev['DEPREL'].tolist()))])
for deprel in list(set(pred_dev['DEPREL'].tolist())):
    data_new = pred_dev[pred_dev['DEPREL'] == deprel]
    print(deprel,
          '&', np.round(pearsonr(data_new['Is.Particular.Norm'], data_new['Is.Particular.Pred'])[0] * 100, sigdig), 
          '&', np.round(r1_score(data_new['Is.Particular.Norm'], data_new['Is.Particular.Pred']) * 100, sigdig), 
          '&', np.round(pearsonr(data_new['Is.Hypothetical.Norm'], data_new['Is.Hypothetical.Pred'])[0] * 100, sigdig), 
          '&', np.round(r1_score(data_new['Is.Hypothetical.Norm'], data_new['Is.Hypothetical.Pred']) * 100, sigdig),
          '&', np.round(pearsonr(data_new['Is.Dynamic.Norm'], data_new['Is.Dynamic.Pred'])[0] * 100, sigdig),
          '&', np.round(r1_score(data_new['Is.Dynamic.Norm'], data_new['Is.Dynamic.Pred']) * 100, sigdig), 
          '&', np.round(r1_score(data_new.loc[:, ['Is.Particular.Norm', 'Is.Hypothetical.Norm', 'Is.Dynamic.Norm']].values, data_new.loc[:, ['Is.Particular.Pred', 'Is.Hypothetical.Pred', 'Is.Dynamic.Pred']].values) * 100, sigdig), "\\\\")


# for pos in list(set(arg_dev['POS'].tolist()).intersection(set(pred_dev['POS'].tolist()))):
#     data_new = arg_dev[arg_dev['POS'] == pos]
#     data_new2 = pred_dev[pred_dev['POS'] == pos]
#     print(pos, 
#           '&', np.round(pearsonr(data_new['Is.Kind.Norm'], data_new['Is.Kind.Pred'])[0] * 100, sigdig), 
#           '&', np.round(r1_score(data_new['Is.Kind.Norm'], data_new['Is.Kind.Pred']) * 100, sigdig), 
#           '&', np.round(pearsonr(data_new['Is.Abstract.Norm'], data_new['Is.Abstract.Pred'])[0] * 100, sigdig), 
#           '&', np.round(r1_score(data_new['Is.Abstract.Norm'], data_new['Is.Abstract.Pred']) * 100, sigdig), 
#           '&', np.round(pearsonr(data_new2['Is.Hypothetical.Norm'], data_new2['Is.Hypothetical.Pred'])[0] * 100, sigdig), 
#           '&', np.round(r1_score(data_new2['Is.Hypothetical.Norm'], data_new2['Is.Hypothetical.Pred']) * 100, sigdig), "\\\\")

# for deprel in list(set(arg_dev['POS'].tolist()).intersection(set(pred_dev['POS'].tolist()))):
#     data_new = arg_dev[arg_dev['DEPREL'] == deprel]
#     data_new2 = pred_dev[pred_dev['DEPREL'] == deprel]
#     print(deprel, 
#           '&', np.round(pearsonr(data_new['Is.Kind.Norm'], data_new['Is.Kind.Pred'])[0] * 100, sigdig), 
#           '&', np.round(r1_score(data_new['Is.Kind.Norm'], data_new['Is.Kind.Pred']) * 100, sigdig), 
#           '&', np.round(pearsonr(data_new['Is.Abstract.Norm'], data_new['Is.Abstract.Pred'])[0] * 100, sigdig), 
#           '&', np.round(r1_score(data_new['Is.Abstract.Norm'], data_new['Is.Abstract.Pred']) * 100, sigdig), 
#           '&', np.round(pearsonr(data_new2['Is.Hypothetical.Norm'], data_new2['Is.Hypothetical.Pred'])[0] * 100, sigdig), 
#           '&', np.round(r1_score(data_new2['Is.Hypothetical.Norm'], data_new2['Is.Hypothetical.Pred']) * 100, sigdig), "\\\\")



In [None]:
pred_dev[(pred_dev['Is.Particular.Norm']<-0.2) & (pred_dev['Is.Hypothetical.Norm']<-0.2)][['Unique.ID', 'Sentences']].sample(5)

In [None]:
 
print(np.round(pearsonr(pred_dev['Is.Particular.Norm'], pred_dev['Is.Hypothetical.Norm'])[0] * 100, sigdig))
print(np.round(pearsonr(pred_dev['Is.Particular.Pred'], pred_dev['Is.Hypothetical.Pred'])[0] * 100, sigdig))
print(np.round(pearsonr(pred_dev['Is.Particular.Norm'], pred_dev['Is.Hypothetical.Pred'])[0] * 100, sigdig))
print(np.round(pearsonr(pred_dev['Is.Particular.Pred'], pred_dev['Is.Hypothetical.Norm'])[0] * 100, sigdig))

In [None]:
sigdig=3
def create_corr_df(df, attributes):
    '''
        Creates a dataframe of correlations among attributes in df
    '''
    df_corr = {}
    for attr in attributes:
        df_corr[attr] = {}
        for attr1 in attributes:
            df_corr[attr][attr1] = np.round(pearsonr(df[attr], df[attr1])[0], sigdig)
    return pd.DataFrame(df_corr)

def create_dist(df, attributes):
    '''
        Finds mean, median, variance of each attribute in df
    '''
    df_props = {}
    for attr in attributes:
        df_props[attr] = {}
        df_props[attr]['mean'] = np.round(np.mean(df[attr]), sigdig)
        df_props[attr]['median'] = np.round(np.median(df[attr]), sigdig) 
        df_props[attr]['var'] = np.round(np.var(df[attr]), sigdig)
    return df_props

## Argument annotations

In [None]:
display(pd.DataFrame(create_corr_df(arg_data_unq, attributes_arg)))
display(pd.DataFrame(create_dist(arg_data_unq, attributes_arg)))

fig, ax = plt.subplots(figsize=[20, 5], nrows=1, ncols=3, squeeze=False, sharey='row')
for i in range(3):
    sns.distplot(arg_data_unq[attributes_arg[i]], ax=ax[0][i]).get_figure()
plt.show()


## Argument predictions

In [None]:
display(pd.DataFrame(create_corr_df(arg_dev, attributes_arg_dev)))
display(pd.DataFrame(create_dist(arg_dev, attributes_arg_dev)))

fig, ax = plt.subplots(figsize=[20, 5], nrows=1, ncols=3, squeeze=False, sharey='row')
for i in range(3):
    sns.distplot(arg_dev[attributes_arg_dev[i]], ax=ax[0][i]).get_figure()
plt.show()

## Predicate annotations

In [None]:
display(pd.DataFrame(create_corr_df(pred_data_unq, attributes_pred)))
display(pd.DataFrame(create_dist(pred_data_unq, attributes_pred)))

fig, ax = plt.subplots(figsize=[20, 5], nrows=1, ncols=3, squeeze=False, sharey='row')
for i in range(3):
    sns.distplot(pred_data_unq[attributes_pred[i]], ax=ax[0][i]).get_figure()
plt.show()

## Predicate predictions

In [None]:
display(pd.DataFrame(create_corr_df(pred_dev, attributes_pred_dev)))
display(pd.DataFrame(create_dist(pred_dev, attributes_pred_dev)))

fig, ax = plt.subplots(figsize=[20, 5], nrows=1, ncols=3, squeeze=False, sharey='row')
for i in range(3):
    sns.distplot(pred_dev[attributes_pred_dev[i]], ax=ax[0][i]).get_figure()
plt.show()

In [None]:
display(arg_dev[(arg_dev['Is.Particular.Norm']<-0.2) & (arg_dev['Is.Abstract.Norm']<-0.2)][['Unique.ID', 'Sentences', 'Word', 'Is.Particular.Norm', 'Is.Particular.Pred', 'Is.Abstract.Norm', 'Is.Abstract.Pred', 'Is.Kind.Norm', 'Is.Kind.Pred']].sort_values(by=['Is.Particular.Pred', 'Is.Abstract.Pred'], ascending=False))

## Proper nouns

In [None]:
upper = arg_dev[(arg_dev['POS']=='PROPN')]

fig, ax = plt.subplots(figsize=[20, 5], nrows=1, ncols=3, squeeze=False, sharey='row')
for i in range(3):
    sns.distplot(upper[attributes_arg[i]], ax=ax[0][i]).get_figure()
plt.show()

display(arg_dev[(arg_dev['POS']=='PROPN')][['Unique.ID', 'Sentences', 'Word', 'Is.Particular.Norm', 'Is.Particular.Pred', 'Is.Kind.Norm', 'Is.Kind.Pred', 'Is.Abstract.Norm', 'Is.Abstract.Pred']].sort_values(by=['Is.Particular.Pred', 'Is.Kind.Pred'], ascending=False))

### Particular and dynamic predicates

In [None]:
display(pred_dev[(pred_dev['Is.Particular.Pred']>0.1) & (pred_dev['Is.Dynamic.Pred']>0.1)][['Unique.ID', 'Sentences', 'Word', 'Is.Particular.Norm', 'Is.Particular.Pred', 'Is.Hypothetical.Norm', 'Is.Hypothetical.Pred', 'Is.Dynamic.Norm', 'Is.Dynamic.Pred']].sort_values(by=['Is.Particular.Pred', 'Is.Dynamic.Pred'], ascending=False))

### Particular and hypothetical predicates

In [None]:
display(pred_dev[(pred_dev['Is.Particular.Pred']>0.1) & (pred_dev['Is.Hypothetical.Pred']>0.1)][['Unique.ID', 'Sentences', 'Word', 'Is.Particular.Norm', 'Is.Particular.Pred', 'Is.Hypothetical.Norm', 'Is.Hypothetical.Pred', 'Is.Dynamic.Norm', 'Is.Dynamic.Pred']].sort_values(by=['Is.Particular.Pred', 'Is.Hypothetical.Pred'], ascending=False))

## Clausal versus other DEPREL

In [None]:
clause_deprel = ['csubj', 'ccomp', 'xcomp', 'advcl', 'acl']
other_deprel = ['root', 'conj', 'parataxis']

pprint([(a, len(pred_dev[pred_dev['DEPREL'].isin(a)])) for a in [clause_deprel, other_deprel]])

for deprel_set in [clause_deprel, other_deprel]:
    data_new = pred_dev[pred_dev['DEPREL'].isin(deprel_set)]
    print('&', np.round(pearsonr(data_new['Is.Particular.Norm'], data_new['Is.Particular.Pred'])[0] * 100, sigdig), 
          '&', np.round(r1_score(data_new['Is.Particular.Norm'], data_new['Is.Particular.Pred']) * 100, sigdig), 
          '&', np.round(pearsonr(data_new['Is.Hypothetical.Norm'], data_new['Is.Hypothetical.Pred'])[0] * 100, sigdig), 
          '&', np.round(r1_score(data_new['Is.Hypothetical.Norm'], data_new['Is.Hypothetical.Pred']) * 100, sigdig),
          '&', np.round(pearsonr(data_new['Is.Dynamic.Norm'], data_new['Is.Dynamic.Pred'])[0] * 100, sigdig),
          '&', np.round(r1_score(data_new['Is.Dynamic.Norm'], data_new['Is.Dynamic.Pred']) * 100, sigdig), 
          '&', np.round(r1_score(data_new.loc[:, ['Is.Particular.Norm', 'Is.Hypothetical.Norm', 'Is.Dynamic.Norm']].values, data_new.loc[:, ['Is.Particular.Pred', 'Is.Hypothetical.Pred', 'Is.Dynamic.Pred']].values) * 100, sigdig), "\\\\")
print(np.round(pearsonr(pred_dev['Is.Particular.Norm'], pred_dev['Is.Particular.Pred'])[0] * 100, sigdig), '&', np.round(r1_score(pred_dev['Is.Particular.Norm'], pred_dev['Is.Particular.Pred']) * 100, sigdig))