In [1]:
from collections import defaultdict
from copy import copy
import numpy as np
import operator
import os
import pandas as pd
import pickle

import logging
logging.basicConfig(level=logging.DEBUG,
        format='%(levelname)-8s [%(lineno)d] %(message)s')

# Task 1: One-mode similarity

## Data: [SimVerb](http://people.ds.cam.ac.uk/dsg40/simverb.html) (Gerz+ EMNLP 2016)

In [6]:
# Load SimVerb-3500: a tab-separated file without a header row, so the five
# column names are supplied here.
simverb = pd.read_csv('/mnt/permanent/Language/English/Data/verb-similarity/simverb-3500/SimVerb-3500.txt', sep='\t',
                      header=None, names=['verb1', 'verb2', 'pos', 'sim', 'rel'])

In [7]:
simverb.describe(percentiles=[])

Unnamed: 0,sim
count,3500.0
mean,4.291554
std,2.652621
min,0.0
50%,4.32
max,9.96


In [8]:
simverb.head()

Unnamed: 0,verb1,verb2,pos,sim,rel
0,take,remove,V,6.81,SYNONYMS
1,walk,trail,V,4.81,COHYPONYMS
2,feed,starve,V,1.49,ANTONYMS
3,shine,polish,V,7.8,SYNONYMS
4,calculate,add,V,5.98,HYPER/HYPONYMS


In [9]:
simverb.groupby('rel').sim.describe(percentiles=[]).sort_values('count', ascending=False)

Unnamed: 0_level_0,count,mean,std,min,50%,max
rel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NONE,2093.0,3.431276,2.342695,0.0,3.15,9.79
HYPER/HYPONYMS,800.0,6.012525,2.104537,0.5,6.31,9.96
SYNONYMS,306.0,6.78915,2.10449,0.5,7.14,9.96
COHYPONYMS,190.0,4.435526,2.381992,0.0,4.665,9.3
ANTONYMS,111.0,0.977748,1.074232,0.0,0.66,6.04


## Data: SimLex-999

In [2]:
# Load SimLex-999: tab-separated; the column names come from the file's own header row.
simlex = pd.read_csv('/mnt/permanent/Language/English/Data/SimLex-999/SimLex-999.txt', sep='\t')

In [3]:
simlex.head()

Unnamed: 0,word1,word2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex)
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41
1,smart,intelligent,A,9.2,1.75,2.46,1,7.11,1,0.67
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93


* conc(w1): The concreteness rating of word1 on a scale of 1-7. Taken from the University of South Florida Free Association Norms database. 

* conc(w2): The concreteness rating of word2 on a scale of 1-7. Taken from the University of South Florida Free Association Norms database. 

* concQ: The quartile the pair occupies based on the two concreteness ratings. Used for some analyses in the above paper. 

* Assoc(USF): The strength of free association from word1 to word2. Values are taken from the University of South Florida Free Association Dataset. 

* SimAssoc333: Binary indicator of whether the pair is one of the 333 most associated in the dataset (according to Assoc(USF)). This subset of SimLex999 is often the hardest for computational models to capture because the noise from high association can confound the similarity rating. See the paper for more details. 

* SD(SimLex): The standard deviation of annotator scores when rating this pair. Low values indicate good agreement between the 15+ annotators on the similarity value SimLex999. Higher scores indicate less certainty. 


In [4]:
simlex.describe(percentiles=[])

Unnamed: 0,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex)
count,999.0,999.0,999.0,999.0,999.0,999.0,999.0
mean,4.561572,3.657087,3.568629,2.501502,0.751512,0.333333,1.274505
std,2.614663,1.13105,1.159572,1.118145,1.344569,0.471641,0.366278
min,0.23,1.19,1.19,1.0,0.0,0.0,0.34
50%,4.67,3.83,3.66,3.0,0.25,0.0,1.31
max,9.8,5.0,5.0,4.0,8.85,1.0,2.18


In [5]:
simlex.groupby('POS').size()

POS
A    111
N    666
V    222
dtype: int64

## Testing the verb tensor

In [121]:
def test_sim(weight, cutoff=2, max_rank=128, verb=True, normalize=True, lmbda=False):
    """Correlate tensor-based word similarities with human similarity ratings.

    For every rank 2, 4, 8, ... up to ``max_rank`` the pickled decomposition
    ``ktensor_{weight}_{cutoff}_{rank}.pkl`` is loaded, every word pair of the
    benchmark is scored with the dot product of the two words' rows in one
    factor matrix, and the scores are stored in a ``tensor_sim_{rank}`` column
    of a copy of the benchmark frame.  The Spearman correlations with the gold
    ratings are printed at the end.

    Parameters
    ----------
    weight : str
        Name of the weighting; only used to build the pickle file names.
    cutoff : int
        Cutoff value; only used to build the pickle file names.
    max_rank : int
        Ranks ``2 ** 1 ... 2 ** int(log2(max_rank))`` are evaluated.
    verb : bool
        If true, evaluate on the global ``simverb`` frame using factor matrix 1
        and the ``'ROOT'`` vocabulary; otherwise evaluate on the global
        ``simlex`` frame using factor matrix 0 and the ``'nsubj'`` vocabulary.
    normalize : bool
        If true, every row of the factor matrix is scaled to unit Euclidean
        length, so the dot product of two rows is their cosine.
    lmbda : bool
        If true, the factor matrix is multiplied by the square root of the
        norm of ``ktensor.lmbda`` (taken along axis 0) before normalization.

    Returns
    -------
    pandas.DataFrame
        Copy of the benchmark frame with one ``tensor_sim_{rank}`` column per
        rank.  A rank whose pickle file is missing gets an all-zero column
        (and a logged warning); a pair with an out-of-vocabulary word gets 0.
    """
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(
            '/mnt/store/home/makrai/project/verb-tensor/just_svo/depCC/sparstensr_{}_{}.pkl'.format(weight, cutoff),
            mode='rb') as infile:
        _, index = pickle.load(infile)
    # Work on a copy so that the module-level benchmark frames stay untouched.
    target_df = (simverb if verb else simlex).copy()
    target_cols = ['verb1', 'verb2'] if verb else ['word1', 'word2']
    relation = 'ROOT' if verb else 'nsubj'
    mode = 1 if verb else 0

    def tensor_based_sim(rank):
        """Add the ``tensor_sim_{rank}`` column for the decomposition of this rank."""
        oov = defaultdict(int)
        with open(
                '/mnt/store/home/makrai/project/verb-tensor/just_svo/depCC/ktensor_{}_{}_{}.pkl'.format(
                    weight, cutoff, rank),
                mode='rb') as infile:
            ktensor, _fit, _n_iterations, _exectimes = pickle.load(infile)
        #modes = ['nsubj', 'ROOT', 'dobj']
        if lmbda:
            # NOTE(review): if ktensor.lmbda is 1-D this reduces to one scalar,
            # i.e. a uniform rescaling of the matrix -- confirm that a
            # per-component weighting was not intended.
            sq_lam = np.sqrt(np.apply_along_axis(np.linalg.norm, 0, ktensor.lmbda))
            ktensor.U[mode] *= sq_lam
        if normalize:
            # Row-wise Euclidean norms, kept as a column vector for broadcasting.
            ktensor.U[mode] /= np.linalg.norm(ktensor.U[mode], axis=1, keepdims=True)

        def pointwise_prod_more(ser):
            """Dot product of the factor rows of the two words in ``ser``."""
            try:
                # TODO add option for lmbda
                # .iloc: the row is labelled by column name, so positional
                # access has to be explicit.
                first = ktensor.U[mode][index[relation][ser.iloc[0]]]
                second = ktensor.U[mode][index[relation][ser.iloc[1]]]
                return first.dot(second)
            except KeyError as e:
                oov[e.args] += 1
                # TODO 0?
                return 0

        target_df['tensor_sim_{}'.format(rank)] = target_df[target_cols].apply(pointwise_prod_more, axis=1)
        #logging.debug(sorted(oov.items(), key=operator.itemgetter(1), reverse=True)[:6])

    for exp in range(1, int(np.log2(max_rank))+1):
        rank = 2**exp
        try:
            tensor_based_sim(rank=rank)
        except FileNotFoundError as e:
            logging.warning(e)
            # Placeholder column; its correlation with the gold ratings is NaN.
            target_df['tensor_sim_{}'.format(rank)] = 0
    sim_col = 'sim' if verb else 'SimLex999'
    # Correlate numeric columns only: the word and label columns are strings,
    # and recent pandas raises on them instead of dropping them silently.
    numeric_df = target_df.select_dtypes(include=['number', 'bool'])
    # simlex has seven numeric columns ahead of the tensor_sim_* columns; the
    # slice skips them so that only the tensor-based scores are printed.
    print(numeric_df.corr(method='spearman').loc[sim_col][0 if verb else 7:])
    return target_df

In [127]:
# Evaluate every weighting on SimVerb (test_sim's default verb=True) for the
# ranks 2 ... 64; test_sim prints the Spearman correlations itself and the
# returned frame is discarded.
for weight in ['log_freq', 'pmi', 'iact_info', 'salience', 'iact_sali', 'log_dice']:
    print('\n{}'.format(weight))
    test_sim(weight, cutoff=2, max_rank=64)


log_freq




sim              1.000000
tensor_sim_2     0.034061
tensor_sim_4     0.044885
tensor_sim_8    -0.019295
tensor_sim_16    0.040635
tensor_sim_32    0.031303
tensor_sim_64    0.041188
Name: sim, dtype: float64

pmi
sim              1.000000
tensor_sim_2    -0.019486
tensor_sim_4    -0.019602
tensor_sim_8     0.049235
tensor_sim_16    0.040779
tensor_sim_32    0.017285
tensor_sim_64    0.089544
Name: sim, dtype: float64

iact_info
sim              1.000000
tensor_sim_2     0.033541
tensor_sim_4    -0.026909
tensor_sim_8     0.040191
tensor_sim_16   -0.004658
tensor_sim_32    0.087713
tensor_sim_64    0.076685
Name: sim, dtype: float64

salience
sim              1.000000
tensor_sim_2    -0.019706
tensor_sim_4    -0.019776
tensor_sim_8     0.050274
tensor_sim_16    0.059199
tensor_sim_32    0.086605
tensor_sim_64    0.068421
Name: sim, dtype: float64

iact_sali
sim              1.000000
tensor_sim_2     0.030620
tensor_sim_4     0.043721
tensor_sim_8    -0.006364
tensor_sim_16    0.039760
t



sim              1.000000
tensor_sim_2     0.035322
tensor_sim_4     0.024823
tensor_sim_8     0.021778
tensor_sim_16    0.035952
tensor_sim_32    0.046634
tensor_sim_64         NaN
Name: sim, dtype: float64


|weight |rank |sim |
|--|--|--|
|log(freq)|16|0.107064|
|pmi|8|0.107581|
|iact info|16|0.108490|
|salience|16|0.125190|

In [126]:
# Spearman correlation with `sim`, computed separately for each relation type.
# NOTE(review): test_sim adds the tensor_sim_* columns to a copy, so `simverb`
# itself has `sim` as its only numeric column and this yields just the trivial
# self-correlation of 1.0 -- presumably the frame returned by test_sim was
# meant here; confirm.
simverb.groupby('rel').corr(method='spearman').sim

rel                
ANTONYMS        sim    1.0
COHYPONYMS      sim    1.0
HYPER/HYPONYMS  sim    1.0
NONE            sim    1.0
SYNONYMS        sim    1.0
Name: sim, dtype: float64

# [Transitive sentence similarity](http://www.cs.ox.ac.uk/activities/compdistmeaning/GS2011data.txt) (Grefenstette and Sadrzadeh, EMNLP 2011)

## The dataset

In [43]:
# Load the GS2011 transitive-sentence data (space-separated); the HIGH / LOW
# labels are parsed as booleans True / False.
svo_sim = pd.read_csv('/mnt/permanent/Language/English/Data/verb-similarity/GS2011data.txt',
                      sep=' ', true_values=['HIGH'], false_values=['LOW'])

In [44]:
cols_ordered = ["subject", "verb", "landmark", "object", "input", "hilo", "participant"]

In [45]:
svo_sim[cols_ordered].head()

Unnamed: 0,subject,verb,landmark,object,input,hilo,participant
0,family,provide,supply,home,4,True,participant20
1,government,provide,leave,cash,3,False,participant20
2,government,provide,supply,cash,7,True,participant20
3,man,provide,leave,money,6,True,participant20
4,man,provide,supply,money,7,True,participant20


In [112]:
svo_sim.groupby('verb').size().sort_values(ascending=False)

verb
write      260
try        260
run        260
meet       260
provide    250
buy        250
show       240
say        240
draw       240
accept     240
dtype: int64

In [113]:
svo_sim.groupby('landmark').size().sort_values(ascending=False)

landmark
visit       130
publish     130
judge       130
move        130
test        130
operate     130
satisfy     130
spell       130
purchase    125
leave       125
supply      125
bribe       125
receive     120
picture     120
state       120
express     120
depict      120
bear        120
attract     120
allege      120
dtype: int64

In [46]:
svo_sim.describe(percentiles=[])

Unnamed: 0,input
count,2500.0
mean,3.5708
std,2.193561
min,0.0
50%,3.0
max,7.0


In [47]:
svo_sim.corr(method='spearman')

Unnamed: 0,input,hilo
input,1.0,0.51689
hilo,0.51689,1.0


## Testing the verb tensor

In [104]:
def predict_verb(weight, rank, cutoff=2, prec_at=5):
    """Predict verbs for the (subject, object) pairs of the global ``svo_sim``.

    Loads the pickled vocabulary index and the rank-``rank`` decomposition for
    the given weighting, scores every verb for each row's subject and object,
    and keeps the ``prec_at`` best-scoring verbs.

    Side effects: adds three columns to the module-level ``svo_sim`` frame --
    ``predicted_{weight}_{rank}`` (list of predicted verbs, empty when the
    subject or object is out of vocabulary) and, for ``target`` in
    ``landmark`` and ``verb``, the boolean ``good_{target}_{weight}_{rank}``
    (whether the row's target word is among the predictions).  The number of
    hits per target is logged at DEBUG level.

    Parameters
    ----------
    weight : str
        Name of the weighting; used in the pickle file names and column names.
    rank : int
        Rank of the decomposition; used in the pickle file names and column names.
    cutoff : int
        Cutoff value; only used to build the pickle file names.
    prec_at : int
        Number of top-scoring verbs kept as predictions.

    Returns
    -------
    None
    """
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(
            '/mnt/store/home/makrai/project/verb-tensor/depCC/sparstensr_{}_{}.pkl'.format(weight, cutoff),
            mode='rb') as infile:
        _, index = pickle.load(infile)
    with open(
            '/mnt/store/home/makrai/project/verb-tensor/depCC/ktensor_{}_{}_{}.pkl'.format(weight, cutoff, rank),
            mode='rb') as infile:
        ktensor, _fit, _n_iterations, _exectimes = pickle.load(infile)
    # modes are ['nsubj', 'ROOT', 'dobj'].
    oov = defaultdict(int)
    predicted_col = 'predicted_{}_{}'.format(weight, rank)

    def verb_pred(ser):
        """Return the ``prec_at`` best verbs for one (subject, object) row."""
        try:
            subject_vec = ktensor.U[0][index['nsubj'][ser['subject']]]
            object_vec = ktensor.U[2][index['dobj'][ser['object']]]
            # One score per row of U[1]: sum over components of
            # lmbda * subject * verb * object.  The sign is flipped so that
            # argsort lists the highest-scoring verbs first.
            predicted_ids = np.argsort(
                (-ktensor.lmbda * subject_vec).dot((ktensor.U[1] * object_vec).T))
            # presumably index['ROOT'] is a bidirectional mapping whose
            # `.inverse` maps ids back to verbs -- TODO confirm
            return [index['ROOT'].inverse[i] for i in predicted_ids[:prec_at]]
        except KeyError as e:
            oov[e.args] += 1
            return []

    svo_sim[predicted_col] = svo_sim[['subject', 'object']].apply(verb_pred, axis=1)
    #logging.debug(sorted(oov.items(), key=operator.itemgetter(1), reverse=True))
    for target in ['landmark', 'verb']:
        good_col = 'good_{}_{}_{}'.format(target, weight, rank)

        def is_good(ser):
            """True if this row's target word is among its predicted verbs."""
            return ser[target] in ser[predicted_col]

        svo_sim[good_col] = svo_sim.apply(is_good, axis=1)
        logging.debug((target, weight, rank, svo_sim[good_col].sum()))

In [133]:
predict_verb('pmi', 32)

DEBUG    [25] ('landmark', 'iact_sali', 64, 38)
DEBUG    [25] ('verb', 'iact_sali', 64, 26)


In [105]:
def for_weight(max_exp_plus_one=7):
    """Run ``predict_verb`` for every weighting and every power-of-two rank.

    The ranks are ``2 ** 1`` up to ``2 ** (max_exp_plus_one - 1)``.  The
    current weighting and rank are logged at INFO level before each run.
    """
    weightings = ('log_freq', 'pmi', 'iact_info', 'salience', 'iact_sali', 'log_dice')
    ranks = [2 ** exponent for exponent in range(1, max_exp_plus_one)]
    for weight in weightings:
        logging.info(weight)
        for rank in ranks:
            logging.info(rank)
            predict_verb(weight, rank)

In [106]:
%time for_weight()

INFO     [3] log_freq
INFO     [6] 2
DEBUG    [25] ('landmark', 'log_freq', 2, 12)
DEBUG    [25] ('verb', 'log_freq', 2, 0)
INFO     [6] 4
DEBUG    [25] ('landmark', 'log_freq', 4, 0)
DEBUG    [25] ('verb', 'log_freq', 4, 24)
INFO     [6] 8
DEBUG    [25] ('landmark', 'log_freq', 8, 25)
DEBUG    [25] ('verb', 'log_freq', 8, 115)
INFO     [6] 16
DEBUG    [25] ('landmark', 'log_freq', 16, 0)
DEBUG    [25] ('verb', 'log_freq', 16, 113)
INFO     [6] 32
DEBUG    [25] ('landmark', 'log_freq', 32, 0)
DEBUG    [25] ('verb', 'log_freq', 32, 0)
INFO     [6] 64
DEBUG    [25] ('landmark', 'log_freq', 64, 13)
DEBUG    [25] ('verb', 'log_freq', 64, 24)
INFO     [3] pmi
INFO     [6] 2
DEBUG    [25] ('landmark', 'pmi', 2, 0)
DEBUG    [25] ('verb', 'pmi', 2, 0)
INFO     [6] 4
DEBUG    [25] ('landmark', 'pmi', 4, 0)
DEBUG    [25] ('verb', 'pmi', 4, 72)
INFO     [6] 8
DEBUG    [25] ('landmark', 'pmi', 8, 0)
DEBUG    [25] ('verb', 'pmi', 8, 74)
INFO     [6] 16
DEBUG    [25] ('landmark', 'pmi', 16, 0)
DEBUG

CPU times: user 25min 13s, sys: 1h 1min, total: 1h 26min 14s
Wall time: 8min 10s


In [51]:
# NOTE(review): predict_verb names its columns 'predicted_{weight}_{rank}' and
# 'good_{target}_{weight}_{rank}', so plain 'predicted' / 'good' presumably do
# not exist in svo_sim -- confirm before indexing with this list.
cols_ordered = ["subject", "verb", "landmark", "object", "input", "hilo", "participant", 'predicted', 'good']

In [52]:
# Summary statistics of the number of rows per distinct landmark / verb.
for target in ['landmark', 'verb']:
    print(target)
    print(svo_sim.groupby(target).size().describe(percentiles=[]))

landmark
count     20.000000
mean     125.000000
std        4.588315
min      120.000000
50%      125.000000
max      130.000000
dtype: float64
verb
count     10.00000
mean     250.00000
std        9.42809
min      240.00000
50%      250.00000
max      260.00000
dtype: float64


In [114]:
# Column sums divided by the number of rows, sorted descending.  For the
# boolean good_* columns this is the fraction of rows whose target word is
# among the predicted verbs.
svo_sim.sum(numeric_only=True).sort_values(ascending=False)/svo_sim.shape[0]

input                         3.5708
hilo                          0.4652
good_verb_pmi_32              0.0968
good_verb_pmi_16              0.0888
good_verb_iact_info_16        0.0880
good_verb_iact_info_32        0.0776
good_verb_pmi_64              0.0480
good_verb_iact_info_64        0.0480
good_verb_log_freq_8          0.0460
good_verb_salience_8          0.0460
good_verb_log_freq_16         0.0452
good_verb_log_dice_16         0.0400
good_verb_salience_16         0.0356
good_verb_pmi_8               0.0296
good_verb_pmi_4               0.0288
good_verb_iact_info_4         0.0288
good_verb_iact_info_8         0.0288
good_verb_salience_64         0.0208
good_verb_log_dice_64         0.0104
good_verb_log_dice_4          0.0104
good_verb_salience_32         0.0104
good_verb_log_dice_8          0.0104
good_landmark_log_freq_8      0.0100
good_landmark_log_dice_64     0.0100
good_landmark_iact_info_16    0.0100
good_landmark_salience_8      0.0100
good_landmark_iact_info_64    0.0100
g

|verb|`* 1`|`* lmbda`|
|----|--|------|
|unnorm|130|**272**|
|norm|0|24|

Majority baseline...

In [54]:
# Number of rows per landmark divided by the number of distinct landmarks.
# NOTE(review): for a majority-baseline accuracy the denominator would
# presumably be the number of rows, len(svo_sim) -- confirm the intent.
target = 'landmark'
svo_sim.groupby(target).size().sort_values()/len(svo_sim[target].unique())

landmark
allege      6.00
attract     6.00
bear        6.00
depict      6.00
express     6.00
state       6.00
picture     6.00
receive     6.00
bribe       6.25
supply      6.25
leave       6.25
purchase    6.25
spell       6.50
satisfy     6.50
operate     6.50
test        6.50
move        6.50
judge       6.50
publish     6.50
visit       6.50
dtype: float64

# [Verb prediction (tensor)](https://everest.hds.utc.fr/doku.php?id=en:lfmnips12) (Jenatton+ NIPS 2012)

In [55]:
dataset_dir = '/mnt/permanent/Language/English/Data/verb-similarity/SVO-tensor-dataset/'
def get_index(pos):
    """Read ``svo-{pos}s.lst`` from ``dataset_dir`` into an id -> word mapping.

    Ids are 1-based line numbers.  Each line is split on underscores; the
    first two fields and the last field are dropped and the remaining fields
    are joined with single spaces.
    """
    list_path = os.path.join(dataset_dir, 'svo-{}s.lst'.format(pos))
    id_to_word = {}
    with open(list_path) as infile:
        for line_number, raw_line in enumerate(infile, start=1):
            fields = raw_line.strip().split('_')
            id_to_word[line_number] = ' '.join(fields[2:-1])
    return id_to_word

In [56]:
# Load the training triples: tab-separated, no header row; the three columns
# are named subject, verb and object.
svo_df = pd.read_csv(os.path.join(dataset_dir, 'svo_data_train_1000000.dat'), sep='\t', header=None, 
                     names=['subject', 'verb', 'object'])

In [57]:
# Lookup tables from 1-based line number to word, read from svo-verbs.lst and
# svo-nouns.lst respectively.
index_verb = get_index('verb')
index_noun = get_index('noun')

In [58]:
# Replace the values in each column with their words via the lookup tables.
# Caution: this overwrites svo_df in place and dict.get returns None for
# missing keys, so running this cell a second time (on the already decoded
# strings) turns every value into None -- reload svo_df before re-running.
svo_df.subject = svo_df.subject.apply(index_noun.get)
svo_df.verb = svo_df.verb.apply(index_verb.get)
svo_df.object = svo_df.object.apply(index_noun.get)

In [59]:
svo_df.head(10)

Unnamed: 0,subject,verb,object
0,man,swipe,credit card
1,george westinghouse,illuminate,exposition
2,personality,vanish,moment
3,fable,highlight,role
4,secretion,call,surfactant
5,device,store,multimedia
6,diversity,base,engineering
7,troop,destroy,weather station
8,flute,include,vein
9,complication,relate,emphysema
