In [41]:
from collections import defaultdict
import numpy as np
import operator
import os
import pandas as pd
import pickle

import logging
logging.basicConfig(level=logging.DEBUG,
        format='%(levelname)-8s [%(lineno)d] %(message)s')

# [SimVerb](http://people.ds.cam.ac.uk/dsg40/simverb.html) (Gerz+ EMNLP 2016)

In [42]:
simverb = pd.read_csv('/mnt/permanent/Language/English/Data/verb-similarity/simverb-3500/SimVerb-3500.txt', sep='\t',
                      header=None, names=['verb1', 'verb2', 'pos', 'sim', 'rel'])

In [43]:
simverb.head()

Unnamed: 0,verb1,verb2,pos,sim,rel
0,take,remove,V,6.81,SYNONYMS
1,walk,trail,V,4.81,COHYPONYMS
2,feed,starve,V,1.49,ANTONYMS
3,shine,polish,V,7.8,SYNONYMS
4,calculate,add,V,5.98,HYPER/HYPONYMS


In [44]:
simverb.shape

(3500, 5)

In [45]:
simverb.groupby('rel').sim.describe(percentiles=[]).sort_values('count', ascending=False)

Unnamed: 0_level_0,count,mean,std,min,50%,max
rel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NONE,2093.0,3.431276,2.342695,0.0,3.15,9.79
HYPER/HYPONYMS,800.0,6.012525,2.104537,0.5,6.31,9.96
SYNONYMS,306.0,6.78915,2.10449,0.5,7.14,9.96
COHYPONYMS,190.0,4.435526,2.381992,0.0,4.665,9.3
ANTONYMS,111.0,0.977748,1.074232,0.0,0.66,6.04


## Tensor

In [46]:
def sim_verb(weight, rank):
    logging.info(weight)
    _, index = pickle.load(open(
        '/mnt/store/home/makrai/project/verb-tensor/depCC/sparstensr_{}_2.pkl'.format(weight), mode='rb'))
    oov = defaultdict(int)
    def tensor_based_sim(rank):
        ktensor, fit, n_iterations, exectimes = pickle.load(open(
            '/mnt/store/home/makrai/project/verb-tensor/depCC/ktensor_{}_2_{}.pkl'.format(weight, rank), mode='rb'))
        #modes = ['nsubj', 'ROOT', 'dobj']
        ktensor.U[1] /= np.apply_along_axis(np.linalg.norm, 1, ktensor.U[1]).reshape((-1,1))
        def pointwise_prod_more(ser):
            try:
                #return ktensor.lmbda .dot( 
                return ktensor.U[1][index['ROOT'][ser[0]]] .dot( ktensor.U[1][index['ROOT'][ser[1]]])
            except KeyError as e:
                oov[e.args] += 1
                return np.nan
        simverb['tensor_sim_{}'.format(rank)] = simverb[['verb1', 'verb2']].apply(pointwise_prod_more, axis=1)
    #logging.debug(sorted(oov.items(), key=operator.itemgetter(1), reverse=True))
    for exp in range(1, int(np.log2(rank))+1):
        tensor_based_sim(rank=2**exp)
    return simverb.corr()[simverb.columns[-4:]].loc['sim']

In [47]:
for weight in ['log_freq', 'pmi', 'iact_info', 'salience', 'log_dice']: # iact_sali
    print(sim_verb(weight, 16))

INFO     [2] log_freq
DEBUG    [19] []
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
INFO     [2] pmi


tensor_sim_2     0.005956
tensor_sim_4     0.000051
tensor_sim_8    -0.048176
tensor_sim_16    0.107064
Name: sim, dtype: float64


DEBUG    [19] []
INFO     [2] iact_info


tensor_sim_2     0.006581
tensor_sim_4     0.003413
tensor_sim_8     0.107581
tensor_sim_16    0.088540
Name: sim, dtype: float64


DEBUG    [19] []
INFO     [2] salience


tensor_sim_2     0.003671
tensor_sim_4    -0.006376
tensor_sim_8     0.054211
tensor_sim_16    0.108490
Name: sim, dtype: float64


DEBUG    [19] []
INFO     [2] log_dice


tensor_sim_2     0.005952
tensor_sim_4    -0.001152
tensor_sim_8    -0.045214
tensor_sim_16    0.125190
Name: sim, dtype: float64


DEBUG    [19] []


tensor_sim_2     0.023153
tensor_sim_4     0.073738
tensor_sim_8    -0.022537
tensor_sim_16    0.050693
Name: sim, dtype: float64


In [48]:
simverb.groupby('rel').corr().sim

rel                          
ANTONYMS        sim              1.000000
                tensor_sim_2     0.017516
                tensor_sim_4    -0.197749
                tensor_sim_8    -0.020607
                tensor_sim_16   -0.084390
COHYPONYMS      sim              1.000000
                tensor_sim_2     0.043319
                tensor_sim_4     0.079444
                tensor_sim_8    -0.032908
                tensor_sim_16   -0.063021
HYPER/HYPONYMS  sim              1.000000
                tensor_sim_2    -0.007254
                tensor_sim_4    -0.002763
                tensor_sim_8    -0.043962
                tensor_sim_16    0.067686
NONE            sim              1.000000
                tensor_sim_2     0.020203
                tensor_sim_4     0.067586
                tensor_sim_8    -0.033773
                tensor_sim_16    0.029902
SYNONYMS        sim              1.000000
                tensor_sim_2    -0.030147
                tensor_sim_4    -0.075950
    

# [Transitive sentence similarity](http://www.cs.ox.ac.uk/activities/compdistmeaning/GS2011data.txt) (Grefenstette and Sadrzadeh, EMNLP 2011)

In [2]:
svo_sim = pd.read_csv('/mnt/permanent/Language/English/Data/verb-similarity/GS2011data.txt', sep=' ', 
                 true_values=['HIGH'], false_values=['LOW'])

In [3]:
cols_ordered = ["subject", "verb", "landmark", "object", "input", "hilo", "participant"]

In [4]:
svo_sim[cols_ordered].head()

Unnamed: 0,subject,verb,landmark,object,input,hilo,participant
0,family,provide,supply,home,4,True,participant20
1,government,provide,leave,cash,3,False,participant20
2,government,provide,supply,cash,7,True,participant20
3,man,provide,leave,money,6,True,participant20
4,man,provide,supply,money,7,True,participant20


In [5]:
svo_sim.describe(percentiles=[])

Unnamed: 0,input
count,2500.0
mean,3.5708
std,2.193561
min,0.0
50%,3.0
max,7.0


In [6]:
svo_sim.corr(method='spearman')

Unnamed: 0,input,hilo
input,1.0,0.51689
hilo,0.51689,1.0


## Tensor

In [36]:
def predict_verb(weight, rank):
    _, index = pickle.load(open(
        '/mnt/store/home/makrai/project/verb-tensor/depCC/sparstensr_{}_2.pkl'.format(weight), mode='rb'))
    ktensor, fit, n_iterations, exectimes = pickle.load(open(
        '/mnt/store/home/makrai/project/verb-tensor/depCC/ktensor_{}_2_{}.pkl'.format(weight, rank), mode='rb'))
    #modes = ['nsubj', 'ROOT', 'dobj']
    #ktensor.U[1] /= np.apply_along_axis(np.linalg.norm, 1, ktensor.U[1]).reshape((-1,1))
    oov = defaultdict(int)
    def verb_pred(ser, prec_at=10):
        try:
            predicted_ids = np.argsort((
                -ktensor.lmbda * ktensor.U[0][index['nsubj'][ser[0]]]) .dot(
                (ktensor.U[1] * ktensor.U[2][index['dobj'][ser[0]]]).T))#] # 
            return [index['ROOT'].inverse[i] for i in predicted_ids[:prec_at]]
        except KeyError as e:
            oov[e.args] += 1
            return []
    svo_sim['predicted_{}_{}'.format(weight, rank)] = svo_sim[['subject', 'object']].apply(verb_pred, axis=1)
    #logging.debug(sorted(oov.items(), key=operator.itemgetter(1), reverse=True))
    for target in ['landmark', 'verb']:
        def is_good(ser):
            return ser[target] in ser['predicted_{}_{}'.format(weight, rank)]
        svo_sim['good_{}_{}_{}'.format(target, weight, rank)] = svo_sim.apply(is_good, axis=1)

In [37]:
def for_weight():
    for weight in ['log_freq', 'pmi', 'iact_info', 'salience', 'log_dice']: # iact_sali
        logging.info(weight)
        for exp in range(1, 5):
            rank = 2 ** exp
            logging.info(rank)
            predict_verb(weight, rank)

In [38]:
%time for_weight()

INFO     [3] log_freq
INFO     [6] 2
INFO     [6] 4
INFO     [6] 8
INFO     [6] 16
INFO     [3] pmi
INFO     [6] 2
INFO     [6] 4
INFO     [6] 8
INFO     [6] 16
INFO     [3] iact_info
INFO     [6] 2
INFO     [6] 4
INFO     [6] 8
INFO     [6] 16
INFO     [3] salience
INFO     [6] 2
INFO     [6] 4
INFO     [6] 8
INFO     [6] 16
INFO     [3] log_dice
INFO     [6] 2
INFO     [6] 4
INFO     [6] 8
INFO     [6] 16


CPU times: user 13min 44s, sys: 39min 48s, total: 53min 33s
Wall time: 5min 11s


In [None]:
cols_ordered = ["subject", "verb", "landmark", "object", "input", "hilo", "participant", 'predicted', 'good']

In [39]:
for target in ['landmark', 'verb']:
    print(target)
    print(svo_sim.groupby(target).size().describe(percentiles=[]))

landmark
count     20.000000
mean     125.000000
std        4.588315
min      120.000000
50%      125.000000
max      130.000000
dtype: float64
verb
count     10.00000
mean     250.00000
std        9.42809
min      240.00000
50%      250.00000
max      260.00000
dtype: float64


In [40]:
svo_sim.sum(numeric_only=True).sort_values(ascending=False)

input                         8927.0
hilo                          1163.0
good_verb_pmi_16               246.0
good_verb_iact_info_16         246.0
good_verb_iact_info_4          242.0
good_verb_pmi_4                194.0
good_verb_pmi_8                170.0
good_verb_log_freq_8           167.0
good_verb_salience_8           167.0
good_verb_iact_info_8          120.0
good_verb_log_freq_16          102.0
good_verb_log_dice_16          102.0
good_verb_salience_16          102.0
good_verb_log_dice_4            78.0
good_landmark_iact_info_16      75.0
good_landmark_pmi_16            62.0
good_landmark_log_freq_8        51.0
good_landmark_salience_8        51.0
good_verb_log_dice_8            50.0
good_landmark_log_dice_4        38.0
good_landmark_pmi_8             37.0
good_landmark_salience_16       37.0
good_landmark_log_dice_2        36.0
good_landmark_log_freq_16       26.0
good_landmark_salience_2        24.0
good_landmark_log_freq_2        24.0
good_landmark_log_dice_8        24.0
g

|verb|`* 1`|`* lmbda`|
|----|--|------|
|unnorm|130|**272**|
|norm|0|24|

Majoroty baseline...

In [None]:
svo_sim[cols_ordered].sample()

# [Verb prediction (tensor)](https://everest.hds.utc.fr/doku.php?id=en:lfmnips12) (Jenatton+ NIPS 2012)

In [None]:
dataset_dir = '/mnt/permanent/Language/English/Data/verb-similarity/SVO-tensor-dataset/'
def get_index(pos):
    with open(os.path.join(dataset_dir, 'svo-{}s.lst'.format(pos))) as infile:
        return {i+1: ' '.join(line.strip().split('_')[2:-1]) for i, line in enumerate(infile)}

In [None]:
svo_df = pd.read_csv(os.path.join(dataset_dir, 'svo_data_train_1000000.dat'), sep='\t', header=None, 
                     names=['subject', 'verb', 'object'])

In [None]:
index_verb = get_index('verb')
index_noun = get_index('noun')

In [None]:
svo_df.subject = svo_df.subject.apply(index_noun.get)
svo_df.verb = svo_df.verb.apply(index_verb.get)
svo_df.object = svo_df.object.apply(index_noun.get)

In [None]:
svo_df.head(10)