In [2]:
from collections import defaultdict
from copy import copy
import numpy as np
import operator
import os
import pickle

import pandas as pd
import tensorly as tl


import matplotlib.pyplot as plt
%pylab inline

from eval_tensor import VerbTensorEvaluator

import logging
logging.basicConfig(level=logging.INFO,
        format='%(levelname)-8s [%(lineno)d] %(message)s')

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [3]:
# Default figure size (width, height in inches) for every plot in this notebook.
# Set through the explicitly imported `plt` instead of the `pylab` name, which is
# only available as a side effect of the `%pylab inline` magic.
plt.rcParams['figure.figsize'] = (20, 10)

In [4]:
#matplotlib.rcParams.update({'font.size': 14})

# 1 One-mode similarity

## 1.1 Data

### 1.1.1 [SimVerb](http://people.ds.cam.ac.uk/dsg40/simverb.html) (Gerz+ EMNLP 2016)

In [13]:
simverb = pd.read_csv('/mnt/permanent/Language/English/Data/verb-similarity/simverb-3500/SimVerb-3500.txt', 
                      sep='\t', header=None, names=['verb1', 'verb2', 'pos', 'sim', 'rel'])

In [14]:
simverb.head()

Unnamed: 0,verb1,verb2,pos,sim,rel
0,take,remove,V,6.81,SYNONYMS
1,walk,trail,V,4.81,COHYPONYMS
2,feed,starve,V,1.49,ANTONYMS
3,shine,polish,V,7.8,SYNONYMS
4,calculate,add,V,5.98,HYPER/HYPONYMS


In [15]:
simverb.describe(percentiles=[])

Unnamed: 0,sim
count,3500.0
mean,4.291554
std,2.652621
min,0.0
50%,4.32
max,9.96


In [16]:
simverb.groupby('rel').sim.describe(percentiles=[]).sort_values('count', ascending=False)

Unnamed: 0_level_0,count,mean,std,min,50%,max
rel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NONE,2093.0,3.431276,2.342695,0.0,3.15,9.79
HYPER/HYPONYMS,800.0,6.012525,2.104537,0.5,6.31,9.96
SYNONYMS,306.0,6.78915,2.10449,0.5,7.14,9.96
COHYPONYMS,190.0,4.435526,2.381992,0.0,4.665,9.3
ANTONYMS,111.0,0.977748,1.074232,0.0,0.66,6.04


### 1.1.2 SimLex-999

In [17]:
simlex = pd.read_csv('/mnt/permanent/Language/English/Data/SimLex-999/SimLex-999.txt', sep='\t')

In [18]:
simlex.head()

Unnamed: 0,word1,word2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex)
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41
1,smart,intelligent,A,9.2,1.75,2.46,1,7.11,1,0.67
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93


* conc(w1): The concreteness rating of word1 on a scale of 1-7. Taken from the University of South Florida Free Association Norms database. 

* conc(w2): The concreteness rating of word2 on a scale of 1-7. Taken from the University of South Florida Free Association Norms database. 

* concQ: The quartile the pair occupies based on the two concreteness ratings. Used for some analyses in the above paper. 

* Assoc(USF): The strength of free association from word1 to word2. Values are taken from the University of South Florida Free Association Dataset. 

* SimAssoc333: Binary indicator of whether the pair is one of the 333 most associated in the dataset (according to Assoc(USF)). This subset of SimLex999 is often the hardest for computational models to capture because the noise from high association can confound the similarity rating. See the paper for more details. 

* SD(SimLex): The standard deviation of annotator scores when rating this pair. Low values indicate good agreement between the 15+ annotators on the similarity value SimLex999. Higher scores indicate less certainty. 


In [19]:
simlex.describe(percentiles=[])

Unnamed: 0,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex)
count,999.0,999.0,999.0,999.0,999.0,999.0,999.0
mean,4.561572,3.657087,3.568629,2.501502,0.751512,0.333333,1.274505
std,2.614663,1.13105,1.159572,1.118145,1.344569,0.471641,0.366278
min,0.23,1.19,1.19,1.0,0.0,0.0,0.34
50%,4.67,3.83,3.66,3.0,0.25,0.0,1.31
max,9.8,5.0,5.0,4.0,8.85,1.0,2.18


In [20]:
simlex.groupby('POS').size()

POS
A    111
N    666
V    222
dtype: int64

## 1.2 Testing the verb tensor

In [28]:
evalor = VerbTensorEvaluator()

def eval_for_for(task_df0, mode_to_test):
    # normlz_vocb=True, lmbda=False, decomp_algo='tucker', weight_name='log_freq'
    data = []
    for cutoff in [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000]:
        for exp in range(10):
            rank = 2**exp
            try:
                score = evalor.test_sim(task_df0, mode_to_test=mode_to_test, cutoff=cutoff, rank=rank)
                data.append((cutoff, rank, score))
            except FileNotFoundError as e:
                pass
            except ValueError as e:
                logging.warning((cutoff, rank, e))
    df = pd.DataFrame(data, columns=['cutoff', 'rank', 'corr'])
    return df.sort_values('corr', ascending=False)

In [29]:
df = eval_for_for(simlex, mode_to_test='nsubj')#, cutoff=500, rank=32)['tensor_sim']



In [30]:
df.head()

Unnamed: 0,cutoff,rank,corr
36,2000,256,0.06647
35,2000,128,0.059054
1,10,2,0.053684
3,20,2,0.045598
13,200,8,0.033862


In [31]:
df = eval_for_for(simverb, mode_to_test='ROOT')#, cutoff=500, rank=32)['tensor_sim']

  np.linalg.norm, 1, factors[mode_i]).reshape((-1,1))
  np.linalg.norm, 1, factors[mode_i]).reshape((-1,1))


In [32]:
df.head()

Unnamed: 0,cutoff,rank,corr
19,500,32,0.103713
18,500,16,0.09655
27,1000,128,0.086127
1,10,2,0.082623
26,1000,64,0.08167


In [33]:
eval_for_for(simlex, mode_to_test='dobj').head()

  np.linalg.norm, 1, factors[mode_i]).reshape((-1,1))
  np.linalg.norm, 1, factors[mode_i]).reshape((-1,1))


Unnamed: 0,cutoff,rank,corr
26,1000,64,0.075883
25,1000,32,0.069587
43,5000,128,0.065126
27,1000,128,0.053762
19,500,32,0.05027


In [34]:
def plot_cutoff(df, col, rank=256):
    """Plot `col` against `cutoff` for a fixed rank, one line per weight.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns `weight`, `rank_`, `cutoff` and `col`.
    col : str
        Name of the column drawn on the y axis.
    rank : int, optional
        Only rows whose `rank_` equals this value are drawn (default 256,
        the value that used to be hard-coded).

    The lines are drawn on the current matplotlib axes in the order in which
    the weights first occur in `df`; the legend lists them in that order.
    """
    weight_names = df.weight.unique()
    for weight in weight_names:
        df0 = df[(df.weight == weight) & (df.rank_ == rank)]
        plt.plot(df0.cutoff, df0[col])
    # The legend does not depend on the loop variable, so build it once;
    # skip it entirely when nothing was drawn.
    if len(weight_names):
        plt.legend(weight_names)

In [35]:
def plot_rank(df, col, cutoff=100):
    """Plot `col` against `rank_` for a fixed cutoff, one line per weight.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns `weight`, `rank_`, `cutoff` and `col`.
    col : str
        Name of the column drawn on the y axis.
    cutoff : int, optional
        Only rows whose `cutoff` equals this value are drawn (default 100,
        the value that used to be hard-coded).

    The lines are drawn on the current matplotlib axes in the order in which
    the weights first occur in `df`; the legend lists them in that order.
    """
    weight_names = df.weight.unique()
    for weight in weight_names:
        df0 = df[(df.weight == weight) & (df.cutoff == cutoff)]
        plt.plot(df0.rank_, df0[col])
    # The legend does not depend on the loop variable, so build it once;
    # skip it entirely when nothing was drawn.
    if len(weight_names):
        plt.legend(weight_names)

In [36]:
# Disabled experiment: the `if False` guard keeps this cell from ever running.
# It would build one result frame per task/mode with `compare_100_256` and draw
# it with `plot_cutoff` and `plot_rank`.
# NOTE(review): `compare_100_256` is not defined anywhere in the visible part of
# this notebook -- confirm it still exists before re-enabling the cell.
if False:
    df = compare_100_256(simverb, 'ROOT', 'sim')
    plot_cutoff(df, 'sim')
    plot_rank(df, 'sim')
    df = compare_100_256(simlex, mode_to_test='nsubj', col='SimLex999')
    plot_cutoff(df, 'SimLex999')
    plot_rank(df, 'SimLex999')
    df = compare_100_256(simlex, mode_to_test='dobj', col='SimLex999')
    plot_cutoff(df, 'SimLex999')
    plot_rank(df, 'SimLex999')
    # weights = pd.unique(simverb_res.weight)

In [37]:
def plot_results(df0, col='sim', save_filen=''):
    """Draw `col` against `rank_` for the rank-256 rows, one line per weight.

    Parameters
    ----------
    df0 : pandas.DataFrame
        Must provide the columns `rank_`, `weight` and `col`.
    col : str
        Column drawn on the y axis; also decides the legend order
        (weights are listed by descending `col`).
    save_filen : str
        When non-empty, the figure is additionally saved as
        `<save_filen>.png` in the Coling2020 image directory.

    NOTE(review): after the `rank_ == 256` filter every x value is 256, so
    each weight is drawn at a single x position -- confirm that this is the
    intended x axis.
    """
    at_rank_256 = df0[df0.rank_ == 256]
    by_score = at_rank_256.sort_values(col, ascending=False)
    weights = pd.unique(by_score.weight)
    for weight in weights:
        per_weight = at_rank_256[at_rank_256.weight == weight]
        per_weight = per_weight.sort_values('rank_')
        plt.plot(per_weight.rank_, per_weight[col])
    # A logarithmic x scale (plt.xscale('log')) was tried here and left disabled.
    plt.legend(weights)
    if not save_filen:
        return
    # Earlier drafts saved to the LREC20 image directory instead.
    target = '/home/makrai/repo/paper/Coling2020/verbtensor/img/{}.png'.format(save_filen)
    plt.savefig(target)

In [38]:
#plot_results(simverb_res)#, save_filen='SimVerb')

In [39]:
#plot_results(simlex_subj, col='SimLex999')#, save_filen='simLex-subj')

In [40]:
#plot_results(simlex_obj, col='SimLex999')#, save_filen='simLex-obj')

# 2 SVO triples (_al et_ Sadrzadeh 2011--2014)

## 2.1 Datasets

  * [GS’11](http://www.cs.ox.ac.uk/activities/compdistmeaning/GS2011data.txt) provided by Grefenstette and Sadrzadeh (EMNLP 2011)
      * each verb pair takes the same subject and object
      * the task has an aspect of verb sense disambiguation,
        as discussed in previous work
        (Kartsaklis and Sadrzadeh, 2013; Milajevs+ 2014; Polajnar+ 2014)
      * For example, the transitive verb “run” is known as polysemous: operate/move
        * “run” and “operate” are similar when subj = “people” and obj = “company”
        * In the same context, “run” is not similar to “move”
  * ML’10 provided by Mitchell and Lapata (2010),
    * pairs of verb-object phrases
  * KS’13 provided by Kartsaklis and Sadrzadeh (2013)
    * complements ML’10 by incorporating an appropriate subject for each VO
  * KS’14 provided by [Kartsaklis and Sadrzadeh (2014)](https://arxiv.org/abs/1405.2874)
    * reannotated version of KS’13 using a crowdsourcing service
  * the latter three require one to capture the topical similarity
    rather than the disambiguation aspect (Polajnar+ 2014)

In [41]:
verb_sim_data_dir = '/mnt/permanent/Language/English/Data/verb-similarity/Sadrzadeh/'

In [42]:
def read_sim_data(filen):
    """Read the space-separated file `filen` from `verb_sim_data_dir`.

    Returns the parsed pandas DataFrame; the first line of the file is used
    as the header (the `read_csv` default).
    """
    path = os.path.join(verb_sim_data_dir, filen)
    return pd.read_csv(path, sep=' ')

### 2.1.1 Pairs of SVO triples with the same but ambiguous verb (GS11)

In [43]:
gs11 = read_sim_data('GS2011data.txt').groupby(['verb', 'subject', 'object', 'landmark', 'hilo']).mean()
print(gs11.shape)
gs11.head()

(200, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,input
verb,subject,object,landmark,hilo,Unnamed: 5_level_1
accept,government,proposal,bear,LOW,2.666667
accept,government,proposal,receive,HIGH,2.583333
accept,government,recommendation,bear,LOW,3.5
accept,government,recommendation,receive,HIGH,3.333333
accept,lawyer,conviction,bear,LOW,3.333333


In [83]:
gs11.groupby('verb').size().sort_values(ascending=False).head()

verb
accept     20
buy        20
draw       20
meet       20
provide    20
dtype: int64

In [84]:
gs11.groupby('landmark').size().sort_values(ascending=False).head()

landmark
allege     10
attract    10
test       10
supply     10
state      10
dtype: int64

In [44]:
cols = ['sentence_id', 'adj_subj', 'subj', 'landmark', 'verb', 'adj_obj', 'obj']
gs12 = read_sim_data('GS2012data.txt').groupby(cols).mean().drop(columns=['annotator_id'])
gs12.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,annotator_score
sentence_id,adj_subj,subj,landmark,verb,adj_obj,obj,Unnamed: 7_level_1
1,statistical,table,show,express,good,result,6.06
2,statistical,table,show,depict,good,result,5.9
3,recent,study,show,express,significant,correlation,5.72
4,recent,study,show,depict,significant,correlation,5.92
5,annual,figure,show,express,substantial,increase,5.74


In [45]:
cols = ['sentence_id', 'adj_subj', 'subj', 'landmark', 'verb', 'adj_obj', 'obj']
gsk13 = read_sim_data('pickering-judgements.txt').groupby(cols).mean().drop(columns=['annotator_id'])
gsk13.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,annotator_score
sentence_id,adj_subj,subj,landmark,verb,adj_obj,obj,Unnamed: 7_level_1
1,private,company,file,register,annual,account,5.44186
2,private,company,file,smooth,annual,account,2.302326
3,young,woman,file,register,long,nail,2.069767
4,young,woman,file,smooth,long,nail,5.023256
5,local,government,file,register,criminal,charge,5.44186


In [46]:
cols = ['subject1', 'verb1', 'object1', 'subject2', 'verb2', 'object2']
ks13_mitchell = read_sim_data('emnlp2013_ml.txt').groupby(cols).mean()
print(ks13_mitchell.shape)
ks13_mitchell.head()

(108, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,score
subject1,verb1,object1,subject2,verb2,object2,Unnamed: 6_level_1
agent,sell,property,family,buy,home,2.777778
agent,sell,property,group,hold,meeting,1.666667
author,write,book,delegate,buy,land,1.5
author,write,book,man,hear,word,2.055556
author,write,book,writer,read,word,2.777778


In [47]:
def get_cols(i):
    """Return the column names `subject<i>`, `verb<i>`, `object<i>`.

    `i` is formatted into each name, so `get_cols(1)` names the first triple
    of a pair and `get_cols('')` gives the unsuffixed names.
    """
    return ['{}{}'.format(role, i) for role in ('subject', 'verb', 'object')]

def get_one_sent_from_pair(i):
    """Extract the `i`-th triple of every pair in `ks13_mitchell`.

    The index of `ks13_mitchell` is turned back into columns, the three
    columns named by `get_cols(i)` are selected, and they are relabelled with
    the unsuffixed names from `get_cols('')`.
    """
    flat = ks13_mitchell.reset_index()
    triple = flat.loc[:, get_cols(i)]
    triple.columns = get_cols('')
    return triple

ks13_long = pd.concat(get_one_sent_from_pair(i) for i in [1, 2])
ks13_long = ks13_long.drop_duplicates()

### 2.1.2 Kartsaklis and Sadrzadeh, Turk

In [48]:
cols = ['subject1', 'verb1', 'object1', 'subject2', 'verb2', 'object2']
ks13_turk = read_sim_data('emnlp2013_turk.txt').groupby(cols).mean().drop(columns=['annotator']).reset_index()
ks13_turk.head()

Unnamed: 0,subject1,verb1,object1,subject2,verb2,object2,score
0,agent,sell,property,family,buy,home,3.125
1,agent,sell,property,group,hold,meeting,1.166667
2,author,write,book,delegate,buy,land,1.130435
3,author,write,book,man,hear,word,1.64
4,author,write,book,writer,read,word,3.166667


In [85]:
ks13_turk.groupby('verb1').size().sort_values(ascending=False).head()

verb1
achieve    5
use        5
win        4
provide    4
fight      3
dtype: int64

In [86]:
ks13_turk.groupby('verb2').size().sort_values(ascending=False).head()

verb2
use        7
provide    5
leave      4
buy        4
reach      3
dtype: int64

### 2.1.3 [Verb prediction task by Jenatton+ (NIPS 2012)](https://everest.hds.utc.fr/doku.php?id=en:lfmnips12)

In [49]:
dataset_dir = '/mnt/permanent/Language/English/Data/verb-similarity/SVO-tensor-dataset/'
def get_index(pos):
    """Build a 1-based line-number -> word mapping from `svo-<pos>s.lst`.

    The file is read from `dataset_dir`.  Each line is stripped and split on
    underscores; the first two fields and the last field are dropped and the
    remaining fields are joined with single spaces.
    """
    list_path = os.path.join(dataset_dir, 'svo-{}s.lst'.format(pos))
    index = {}
    with open(list_path) as infile:
        for line_no, line in enumerate(infile, start=1):
            fields = line.strip().split('_')
            index[line_no] = ' '.join(fields[2:-1])
    return index

In [50]:
svo_df = pd.read_csv(os.path.join(dataset_dir, 'svo_data_train_1000000.dat'), sep='\t', header=None, 
                     names=['subject', 'verb', 'object'])

In [51]:
index_verb = get_index('verb')
index_noun = get_index('noun')

In [52]:
svo_df.subject = svo_df.subject.apply(index_noun.get)
svo_df.verb = svo_df.verb.apply(index_verb.get)
svo_df.object = svo_df.object.apply(index_noun.get)

In [72]:
svo_df.sample(5)

Unnamed: 0,subject,verb,object
606059,process,use,assumption
985729,college,hold,portfolio
892307,theory,base,linear b
248801,bear,beg,food
127693,band,intend,album


## 2.2 Similarity (KS Turk)

In [54]:
eval_for_for(ks13_turk, mode_to_test='svo').head()



Unnamed: 0,cutoff,rank,corr
35,2000,128,0.653291
26,1000,64,0.645071
34,2000,64,0.631077
27,1000,128,0.620881
36,2000,256,0.560317


In [58]:
weights = ["dice_sali", "iact_info", "iact_sali", "log_dice", "log_freq", "niact", "npmi", "pmi", "salience"]

 
def eval_for_weight(task_df0, mode_to_test='svo', cutoff=2000, rank=128):
    # normlz_vocb=True, lmbda=False, decomp_algo='tucker', weight_name='log_freq'
    data = []
    for weight_name in weights:
        try:
            score_d = evalor.test_sim(task_df0, mode_to_test=mode_to_test, cutoff=cutoff, rank=rank, weight=weight_name)
            data.append((cutoff, rank, weight_name, score_d))
        except FileNotFoundError as e:
            logging.warning(weight_name)
        except ValueError as e:
            logging.warning((cutoff, rank, e))
    df = pd.DataFrame(data, columns=['cutoff', 'rank', 'weight', 'corr'])
    return df.sort_values('corr', ascending=False)

eval_for_weight(ks13_turk)

Unnamed: 0,cutoff,rank,weight,corr
4,2000,128,log_freq,0.653291
6,2000,128,npmi,0.640273
8,2000,128,salience,0.639135
7,2000,128,pmi,0.635618
3,2000,128,log_dice,0.607359
0,2000,128,dice_sali,0.600164
2,2000,128,iact_sali,0.560026
5,2000,128,niact,0.545641
1,2000,128,iact_info,0.525882


In [61]:
evalor.test_sim(ks13_turk)

0.653290539345317

In [None]:
#plot_cutoff(df, 'score')

In [None]:
#plot_rank(df, 'score')

In [None]:
#plot_results(df, col='score')#, save_filen='svo')

In [None]:
#df.plot.bar(x='weight', y='score')

## 2.3 Verb prediction

In [71]:
ks13_turk

Unnamed: 0,subject1,verb1,object1,subject2,verb2,object2,score
0,agent,sell,property,family,buy,home,3.125000
1,agent,sell,property,group,hold,meeting,1.166667
2,author,write,book,delegate,buy,land,1.130435
3,author,write,book,man,hear,word,1.640000
4,author,write,book,writer,read,word,3.166667
...,...,...,...,...,...,...,...
103,woman,drink,water,doctor,use,test,1.125000
104,woman,drink,water,system,use,method,1.083333
105,worker,join,party,employee,leave,company,1.400000
106,writer,read,word,family,receive,letter,2.090909


In [73]:
ks13_turk.groupby('verb2').size().sort_values(ascending=False)

verb2
use          7
provide      5
leave        4
buy          4
reach        3
exercise     3
cut          3
emphasise    3
lift         2
pass         2
play         2
pose         2
present      2
meet         2
raise        2
increase     2
receive      2
reduce       2
remember     2
set          2
start        2
stretch      2
suffer       2
join         2
win          2
hold         2
begin        2
cross        2
discuss      2
close        2
encourage    2
face         2
cause        2
follow       2
hear         2
collect      2
require      1
ask          1
share        1
address      1
wave         1
send         1
sell         1
satisfy      1
read         1
consider     1
help         1
develop      1
drink        1
produce      1
express      1
pay          1
offer        1
need         1
acquire      1
achieve      1
dtype: int64

In [65]:
# Call `evalor.predict_verb` on the SVO triples once per association weight,
# logging the weight name first; the return value is discarded.
# NOTE(review): the recorded output of this cell stops at 'log_freq' with
# "TypeError: cannot unpack non-iterable NoneType object" -- presumably raised
# inside `predict_verb`; confirm and fix there before relying on this cell.
for weight in ['log_freq', 'pmi', 'iact_info', 'salience', 'iact_sali', 'log_dice', 'dice_sali']:
    logging.info(weight)
    evalor.predict_verb(svo_df, weight=weight)
    #except FileNotFoundError as e:
    #logging.warning(e)

INFO     [2] log_freq


TypeError: cannot unpack non-iterable NoneType object

In [63]:
%time for_weight_and_rank()

INFO     [3] log_freq


TypeError: cannot unpack non-iterable NoneType object

With a 44 k $\times$ 9 k $\times$ 39 k tensor:

|assoc measure|rank|verb|
|-|-|-|
|pmi|256|318|
|salience|128|318|
|log-Dice|128|270|

In [None]:
svo_sim.sum(numeric_only=True).sort_values(ascending=False).head()/svo_sim.shape[0]

|verb|`* 1`|`* lmbda`|
|----|--|------|
|unnorm|130|**272**|
|norm|0|24|

Majority baseline...

In [None]:
target = 'verb'
svo_sim.groupby(target).size().sort_values()/svo_sim.shape[0]

## Attic: Exploring GS11

In [None]:
svo_sim = gs11.reset_index()

In [None]:
# Recode only the `hilo` column: 1 where it equals 'HIGH', 0 otherwise.
# Comparing the whole frame to 'HIGH' would produce a multi-column boolean
# frame, which cannot be stored in the single `hilo` column.
svo_sim['hilo'] = (svo_sim['hilo'] == 'HIGH').astype(int)

In [None]:
cols_ordered = ["subject", "verb", "landmark", "object", "input", "hilo"]

In [None]:
svo_sim[cols_ordered].head()

In [None]:
svo_sim.groupby('landmark').size().sort_values(ascending=False)

In [None]:
svo_sim.groupby('verb').size().sort_values(ascending=False)

In [None]:
svo_sim.describe(percentiles=[])

In [None]:
# Spearman correlation between the numeric columns only.  The string columns
# are excluded explicitly because recent pandas versions raise on them instead
# of dropping them silently.
svo_sim.select_dtypes(include='number').corr(method='spearman')