In [171]:
from collections import defaultdict
import numpy as np
import operator
import os
import pandas as pd
import pickle

import logging
logging.basicConfig(level=logging.DEBUG,
        format='%(levelname)-8s [%(lineno)d] %(message)s')

In [46]:
sparse_tensor, index = pickle.load(open(
    '/mnt/store/home/makrai/project/verb-tensor/depCC/sparstensr_freq_2.pkl', mode='rb'))

# [Transitive sentence similarity](http://www.cs.ox.ac.uk/activities/compdistmeaning/GS2011data.txt) (Grefenstette and Sadrzadeh, EMNLP 2011)

In [169]:
svo_sim = pd.read_csv('/mnt/permanent/Language/English/Data/verb-similarity/GS2011data.txt', sep=' ', 
                 true_values=['HIGH'], false_values=['LOW'])

In [354]:
cols_ordered = ["subject", "verb", "landmark", "object", "input", "hilo", "participant"]

In [351]:
svo_sim[cols_ordered].head()

Unnamed: 0,subject,verb,landmark,object,input,hilo,participant
0,family,provide,supply,home,4,True,participant20
1,government,provide,leave,cash,3,False,participant20
2,government,provide,supply,cash,7,True,participant20
3,man,provide,leave,money,6,True,participant20
4,man,provide,supply,money,7,True,participant20


In [21]:
svo_sim.corr()

Unnamed: 0,input,hilo
input,1.0,0.524785
hilo,0.524785,1.0


## Tensor

In [274]:
ktensor, fit, n_iterations, exectimes = pickle.load(open(
    '/mnt/store/home/makrai/project/verb-tensor/depCC/ktensor_freq_2_{}.pkl'.format(32), mode='rb'))
#modes = ['nsubj', 'ROOT', 'dobj']
#ktensor.U[1] /= np.apply_along_axis(np.linalg.norm, 1, ktensor.U[1]).reshape((-1,1))


In [361]:
oov = defaultdict(int)
def verb_pred(ser):
    try:
        predicted_ids = np.argsort((-ktensor.lmbda * 
                                    ktensor.U[0][index['nsubj'][ser[0]]]) .dot(
            (ktensor.U[1] * ktensor.U[2][index['dobj'][ser[0]]]).T))#] # 
        return [index['ROOT'].inverse[i] for i in predicted_ids][:10]
    except KeyError as e:
        oov[e.args] += 1
        return []

In [362]:
%time svo_sim['predicted'] = svo_sim[['subject', 'object']].apply(verb_pred, axis=1)

CPU times: user 7min 53s, sys: 20min 53s, total: 28min 46s
Wall time: 1min 58s


In [363]:
sorted(oov.items(), key=operator.itemgetter(1), reverse=True)

[(('tribunal',), 26),
 (('runway',), 26),
 (('cinema',), 24),
 (('spokesman',), 24)]

In [364]:
cols_ordered += ['predicted']

def is_good(ser):
    return ser['verb'] in ser['predicted'] or ser['landmark'] in ser['predicted']
        
svo_sim[svo_sim.apply(is_good, axis=1)][cols_ordered].groupby('verb').size()

verb
accept      60
provide    134
show       120
dtype: int64

# [Verb prediction (tensor)](https://everest.hds.utc.fr/doku.php?id=en:lfmnips12) (Jenatton+ NIPS 2012)

In [185]:
dataset_dir = '/mnt/permanent/Language/English/Data/verb-similarity/SVO-tensor-dataset/'
def get_index(pos):
    with open(os.path.join(dataset_dir, 'svo-{}s.lst'.format(pos))) as infile:
        return {i+1: ' '.join(line.strip().split('_')[2:-1]) for i, line in enumerate(infile)}

In [193]:
svo_df = pd.read_csv(os.path.join(dataset_dir, 'svo_data_train_1000000.dat'), sep='\t', header=None, 
                     names=['subject', 'verb', 'object'])

In [187]:
index_verb = get_index('verb')
index_noun = get_index('noun')

In [194]:
svo_df.subject = svo_df.subject.apply(index_noun.get)
svo_df.verb = svo_df.verb.apply(index_verb.get)
svo_df.object = svo_df.object.apply(index_noun.get)

In [200]:
svo_df.head(10)

Unnamed: 0,subject,verb,object
0,man,swipe,credit card
1,george westinghouse,illuminate,exposition
2,personality,vanish,moment
3,fable,highlight,role
4,secretion,call,surfactant
5,device,store,multimedia
6,diversity,base,engineering
7,troop,destroy,weather station
8,flute,include,vein
9,complication,relate,emphysema


# [SimVerb](http://people.ds.cam.ac.uk/dsg40/simverb.html) (Gerz+ EMNLP 2016)

In [205]:
simverb = pd.read_csv('/mnt/permanent/Language/English/Data/verb-similarity/simverb-3500/SimVerb-3500.txt', sep='\t',
                      header=None, names=['verb1', 'verb2', 'pos', 'sim', 'rel'])

In [206]:
simverb.head()

Unnamed: 0,verb1,verb2,pos,sim,rel
0,take,remove,V,6.81,SYNONYMS
1,walk,trail,V,4.81,COHYPONYMS
2,feed,starve,V,1.49,ANTONYMS
3,shine,polish,V,7.8,SYNONYMS
4,calculate,add,V,5.98,HYPER/HYPONYMS


In [207]:
simverb.shape

(3500, 5)

In [208]:
simverb.groupby('rel').sim.describe(percentiles=[]).sort_values('count', ascending=False)

Unnamed: 0_level_0,count,mean,std,min,50%,max
rel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NONE,2093.0,3.431276,2.342695,0.0,3.15,9.79
HYPER/HYPONYMS,800.0,6.012525,2.104537,0.5,6.31,9.96
SYNONYMS,306.0,6.78915,2.10449,0.5,7.14,9.96
COHYPONYMS,190.0,4.435526,2.381992,0.0,4.665,9.3
ANTONYMS,111.0,0.977748,1.074232,0.0,0.66,6.04


## Tensor

In [209]:
sparse_tensor, index = pickle.load(open(
    '/mnt/store/home/makrai/project/verb-tensor/depCC/sparstensr_freq_2.pkl', mode='rb'))

In [222]:
def corr(rank=32):
    ktensor, fit, n_iterations, exectimes = pickle.load(open(
        '/mnt/store/home/makrai/project/verb-tensor/depCC/ktensor_freq_2_{}.pkl'.format(rank), mode='rb'))
    #modes = ['nsubj', 'ROOT', 'dobj']
    ktensor.U[1] /= np.apply_along_axis(np.linalg.norm, 1, ktensor.U[1]).reshape((-1,1))
    oov = defaultdict(int)
    def tensor_sim(ser):
        try:
            return ktensor.lmbda .dot( ktensor.U[1][index['ROOT'][ser[0]]] * ktensor.U[1][index['ROOT'][ser[1]]])
        except KeyError as e:
            oov[e.args] += 1
            return np.nan
    simverb['tensor_sim_{}'.format(rank)] = simverb[['verb1', 'verb2']].apply(tensor_sim, axis=1)

In [211]:
sorted(oov.items(), key=operator.itemgetter(1), reverse=True)

[(('yearn',), 8),
 (('decay',), 7),
 (('despair',), 6),
 (('barter',), 5),
 (('implode',), 4),
 (('perspire',), 3),
 (('croak',), 3),
 (('abstain',), 3),
 (('misspend',), 2),
 (('conspire',), 2),
 (('capitulate',), 1)]

In [223]:
for exp in range(1,6):
    corr(rank=2**exp)

  """
  """


In [224]:
simverb.head()

Unnamed: 0,verb1,verb2,pos,sim,rel,tensor_sim_2,tensor_sim_4,tensor_sim_8,tensor_sim_16,tensor_sim_32
0,take,remove,V,6.81,SYNONYMS,4.417431,-0.191475,0.992728,3.775465,0.984036
1,walk,trail,V,4.81,COHYPONYMS,1.291484,0.966819,8.213339,11.994072,1.176555
2,feed,starve,V,1.49,ANTONYMS,2.767968,1.335027,5.412249,11.542765,3.010849
3,shine,polish,V,7.8,SYNONYMS,0.911546,1.539647,2.496265,1.284822,0.39731
4,calculate,add,V,5.98,HYPER/HYPONYMS,1.554247,0.744014,0.951708,-0.201106,1.058874


In [213]:
simverb.describe(percentiles=[])

Unnamed: 0,sim
count,3500.0
mean,4.291554
std,2.652621
min,0.0
50%,4.32
max,9.96


In [225]:
simverb.corr()

Unnamed: 0,sim,tensor_sim_2,tensor_sim_4,tensor_sim_8,tensor_sim_16,tensor_sim_32
sim,1.0,0.042259,0.076794,0.058511,0.126952,0.127369
tensor_sim_2,0.042259,1.0,0.177035,0.286588,0.11692,0.181828
tensor_sim_4,0.076794,0.177035,1.0,0.120886,0.033358,0.066924
tensor_sim_8,0.058511,0.286588,0.120886,1.0,0.646851,0.450475
tensor_sim_16,0.126952,0.11692,0.033358,0.646851,1.0,0.608751
tensor_sim_32,0.127369,0.181828,0.066924,0.450475,0.608751,1.0


In [226]:
simverb.groupby('rel').corr().sim

rel                          
ANTONYMS        sim              1.000000
                tensor_sim_2    -0.045373
                tensor_sim_4    -0.014305
                tensor_sim_8    -0.024783
                tensor_sim_16    0.019699
                tensor_sim_32   -0.008523
COHYPONYMS      sim              1.000000
                tensor_sim_2     0.091303
                tensor_sim_4    -0.025353
                tensor_sim_8     0.045322
                tensor_sim_16    0.032658
                tensor_sim_32    0.047848
HYPER/HYPONYMS  sim              1.000000
                tensor_sim_2    -0.000183
                tensor_sim_4     0.047938
                tensor_sim_8     0.037215
                tensor_sim_16    0.063024
                tensor_sim_32    0.075866
NONE            sim              1.000000
                tensor_sim_2     0.046988
                tensor_sim_4     0.082898
                tensor_sim_8     0.087522
                tensor_sim_16    0.144383
    