In [17]:
from collections import defaultdict
from copy import copy
import numpy as np
import operator
import os
import pandas as pd
import pickle

import logging
logging.basicConfig(level=logging.DEBUG,
        format='%(levelname)-8s [%(lineno)d] %(message)s')

# Task 1: One-mode similarity

## Data: SimLex-999

In [2]:
simlex = pd.read_csv('/mnt/permanent/Language/English/Data/SimLex-999/SimLex-999.txt', sep='\t')

In [3]:
simlex.head()

Unnamed: 0,word1,word2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex)
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41
1,smart,intelligent,A,9.2,1.75,2.46,1,7.11,1,0.67
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93


* conc(w1): The concreteness rating of word1 on a scale of 1-7. Taken from the University of South Florida Free Association Norms database. 

* conc(w2): The concreteness rating of word2 on a scale of 1-7. Taken from the University of South Florida Free Association Norms database. 

* concQ: The quartile the pair occupies based on the two concreteness ratings. Used for some analyses in the above paper. 

* Assoc(USF): The strength of free association from word1 to word2. Values are taken from the University of South Florida Free Association Dataset. 

* SimAssoc333: Binary indicator of whether the pair is one of the 333 most associated in the dataset (according to Assoc(USF)). This subset of SimLex999 is often the hardest for computational models to capture because the noise from high association can confound the similarity rating. See the paper for more details. 

* SD(SimLex): The standard deviation of annotator scores when rating this pair. Low values indicate good agreement between the 15+ annotators on the similarity value SimLex999. Higher scores indicate less certainty. 


In [4]:
simlex.describe(percentiles=[])

Unnamed: 0,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex)
count,999.0,999.0,999.0,999.0,999.0,999.0,999.0
mean,4.561572,3.657087,3.568629,2.501502,0.751512,0.333333,1.274505
std,2.614663,1.13105,1.159572,1.118145,1.344569,0.471641,0.366278
min,0.23,1.19,1.19,1.0,0.0,0.0,0.34
50%,4.67,3.83,3.66,3.0,0.25,0.0,1.31
max,9.8,5.0,5.0,4.0,8.85,1.0,2.18


In [5]:
simlex.groupby('POS').size()

POS
A    111
N    666
V    222
dtype: int64

## Data: [SimVerb](http://people.ds.cam.ac.uk/dsg40/simverb.html) (Gerz+ EMNLP 2016)

In [6]:
simverb = pd.read_csv('/mnt/permanent/Language/English/Data/verb-similarity/simverb-3500/SimVerb-3500.txt', sep='\t',
                      header=None, names=['verb1', 'verb2', 'pos', 'sim', 'rel'])

In [7]:
simverb.describe(percentiles=[])

Unnamed: 0,sim
count,3500.0
mean,4.291554
std,2.652621
min,0.0
50%,4.32
max,9.96


In [8]:
simverb.head()

Unnamed: 0,verb1,verb2,pos,sim,rel
0,take,remove,V,6.81,SYNONYMS
1,walk,trail,V,4.81,COHYPONYMS
2,feed,starve,V,1.49,ANTONYMS
3,shine,polish,V,7.8,SYNONYMS
4,calculate,add,V,5.98,HYPER/HYPONYMS


In [9]:
simverb.groupby('rel').sim.describe(percentiles=[]).sort_values('count', ascending=False)

Unnamed: 0_level_0,count,mean,std,min,50%,max
rel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NONE,2093.0,3.431276,2.342695,0.0,3.15,9.79
HYPER/HYPONYMS,800.0,6.012525,2.104537,0.5,6.31,9.96
SYNONYMS,306.0,6.78915,2.10449,0.5,7.14,9.96
COHYPONYMS,190.0,4.435526,2.381992,0.0,4.665,9.3
ANTONYMS,111.0,0.977748,1.074232,0.0,0.66,6.04


## Testing the verb tensor

In [94]:
def test_sim(weight, max_rank=32, cutoff=2, verb=True, normalize=True):
    """Correlate tensor-based word-pair similarities with human ratings.

    For each rank 2, 4, 8, ... up to ``max_rank`` a pickled ktensor is loaded,
    a similarity score is computed for every word pair of the target data set
    and stored in a ``tensor_sim_<rank>`` column of a copy of that data set.
    The (pandas default, i.e. Pearson) correlation of these columns with the
    human similarity column is printed at the end.

    Parameters
    ----------
    weight : str
        Name of the weighting; only used to build the pickle file names.
    max_rank : int
        Largest rank to try. Ranks are the powers of two
        ``2**1 .. 2**int(log2(max_rank))``.
    cutoff : int
        Cutoff value; only used to build the pickle file names.
    verb : bool
        If true, evaluate on the global ``simverb`` frame using the ``'ROOT'``
        index and factor matrix 1; otherwise on the global ``simlex`` frame
        using the ``'nsubj'`` index and factor matrix 0.
    normalize : bool
        If true, every row of the selected factor matrix is divided by its
        norm before similarities are computed.

    Returns
    -------
    The ``index`` object loaded from the ``sparstensr`` pickle.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code; only use this with pickle
    files from a trusted source.
    """
    # `with` makes sure the file handle is closed again (the handle used to leak).
    with open(
            '/mnt/store/home/makrai/project/verb-tensor/just_svo/depCC/sparstensr_{}_{}.pkl'.format(weight, cutoff),
            mode='rb') as infile:
        _, index = pickle.load(infile)
    target_df = copy(simverb if verb else simlex)
    target_cols = ['verb1', 'verb2'] if verb else ['word1', 'word2']
    relation = 'ROOT' if verb else 'nsubj'
    # Position of the factor matrix in ktensor.U that belongs to `relation`.
    tensor_mode = 1 if verb else 0

    def tensor_based_sim(rank):
        """Add the column ``tensor_sim_<rank>`` to ``target_df``."""
        oov = defaultdict(int)
        with open(
                '/mnt/store/home/makrai/project/verb-tensor/just_svo/depCC/ktensor_{}_{}_{}.pkl'.format(
                    weight, cutoff, rank),
                mode='rb') as infile:
            ktensor, _fit, _n_iterations, _exectimes = pickle.load(infile)
        #modes = ['nsubj', 'ROOT', 'dobj']
        if normalize:
            # NOTE(review): a row whose norm is 0 is divided by zero here,
            # which presumably yields NaN similarities -- confirm this is acceptable.
            ktensor.U[tensor_mode] /= np.apply_along_axis(
                np.linalg.norm, 1, ktensor.U[tensor_mode]).reshape((-1, 1))

        def pointwise_prod_more(ser):
            """lmbda-weighted sum of the element-wise product of the two word rows."""
            try:
                # TODO add option for lmbda
                # .iloc: the row Series is labelled by column names, so the two
                # words have to be picked by position explicitly.
                row1 = ktensor.U[tensor_mode][index[relation][ser.iloc[0]]]
                row2 = ktensor.U[tensor_mode][index[relation][ser.iloc[1]]]
                return ktensor.lmbda.dot(row1 * row2)
            except KeyError as e:
                # Word missing from the index: count it and fall back to 0.
                oov[e.args] += 1
                # TODO 0?
                return 0

        target_df['tensor_sim_{}'.format(rank)] = target_df[target_cols].apply(pointwise_prod_more, axis=1)
        #logging.debug(sorted(oov.items(), key=operator.itemgetter(1), reverse=True)[:6])

    for exp in range(1, int(np.log2(max_rank))+1):
        rank = 2**exp
        try:
            tensor_based_sim(rank=rank)
        except Exception as e:
            # Best effort: if anything goes wrong for this rank, log it and
            # fill the column with a constant 0 (its correlation shows up as NaN).
            logging.warning(e)
            target_df['tensor_sim_{}'.format(rank)] = 0
    sim_col = 'sim' if verb else 'SimLex999'
    # Restrict to numeric columns explicitly: DataFrame.corr() no longer drops
    # the string columns (words, POS, relation) silently in recent pandas.
    correlations = target_df.select_dtypes(include=[np.number]).corr().loc[sim_col]
    # For simlex the first 7 numeric columns are the data set's own ratings.
    print(correlations.iloc[0 if verb else 7:])
    return index

In [87]:
index['ROOT']['stop'], len(index['ROOT'])


(569, 1037)

In [97]:
# Run the similarity evaluation once per weighting and print each result
# under the weighting's name.
for weight in ['log_freq', 'pmi', 'iact_info', 'salience', 'iact_sali', 'log_dice']:
    print('\n{}'.format(weight))
    # `index` is rebound on every iteration; after the loop it holds the value
    # returned by the last call.
    index = test_sim(weight, cutoff=2, max_rank=32)


log_freq




sim              1.000000
tensor_sim_2     0.034050
tensor_sim_4    -0.023841
tensor_sim_8    -0.004892
tensor_sim_16   -0.084970
tensor_sim_32         NaN
Name: sim, dtype: float64

pmi




sim              1.000000
tensor_sim_2    -0.002435
tensor_sim_4    -0.002194
tensor_sim_8    -0.041352
tensor_sim_16   -0.016240
tensor_sim_32         NaN
Name: sim, dtype: float64

iact_info




sim              1.000000
tensor_sim_2     0.005982
tensor_sim_4    -0.024775
tensor_sim_8     0.028299
tensor_sim_16   -0.096512
tensor_sim_32         NaN
Name: sim, dtype: float64

salience




sim              1.000000
tensor_sim_2    -0.006758
tensor_sim_4    -0.007111
tensor_sim_8     0.044888
tensor_sim_16    0.076273
tensor_sim_32         NaN
Name: sim, dtype: float64

iact_sali




sim              1.000000
tensor_sim_2     0.009355
tensor_sim_4    -0.019242
tensor_sim_8    -0.001272
tensor_sim_16         NaN
tensor_sim_32         NaN
Name: sim, dtype: float64

log_dice




sim              1.000000
tensor_sim_2     0.034188
tensor_sim_4     0.011173
tensor_sim_8     0.002577
tensor_sim_16         NaN
tensor_sim_32         NaN
Name: sim, dtype: float64


In [24]:
# NOTE(review): in the code visible in this notebook `target_df` is only
# assigned inside test_sim (as a local variable), so this cell presumably
# relies on a global left over from an earlier kernel state -- confirm
# before re-running on a fresh kernel.
target_df.describe(percentiles=[])

Unnamed: 0,sim,tensor_sim_2,tensor_sim_4,tensor_sim_8,tensor_sim_16,tensor_sim_32,tensor_sim_64,tensor_sim_128,tensor_sim_256
count,3500.0,3494.0,3497.0,3497.0,3497.0,3500.0,3500.0,3500.0,3500.0
mean,4.291554,0.001417,0.000315,3.1e-05,-5.4e-05,0.0,0.0,0.0,0.0
std,2.652621,0.083767,0.02992,0.023823,0.002706,0.0,0.0,0.0,0.0
min,0.0,0.0,-1.055305,-0.98742,-0.157671,0.0,0.0,0.0,0.0
50%,4.32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,9.96,4.951466,1.055223,1.0,0.0,0.0,0.0,0.0,0.0


|weight |rank |sim |
|--|--|--|
|log(freq)|16|0.107064|
|pmi|8|0.107581|
|iact info|16|0.108490|
|salience|16|0.125190|

In [None]:
simverb.groupby('rel').corr().sim

# [Transitive sentence similarity](http://www.cs.ox.ac.uk/activities/compdistmeaning/GS2011data.txt) (Grefenstette and Sadrzadeh, EMNLP 2011)

## The dataset

In [None]:
# Space-separated file; the strings 'HIGH' and 'LOW' are declared as the
# true / false values so that pandas can parse them as booleans
# (presumably the `hilo` column -- confirm against the file).
svo_sim = pd.read_csv('/mnt/permanent/Language/English/Data/verb-similarity/GS2011data.txt', sep=' ', 
                 true_values=['HIGH'], false_values=['LOW'])

In [None]:
cols_ordered = ["subject", "verb", "landmark", "object", "input", "hilo", "participant"]

In [None]:
svo_sim[cols_ordered].head()

In [None]:
svo_sim.describe(percentiles=[])

In [None]:
svo_sim.corr(method='spearman')

## Testing the verb tensor

In [None]:
def predict_verb(weight, rank, cutoff=2, prec_at=5):
    """Predict verbs for the (subject, object) pairs of the global ``svo_sim``.

    For every row of ``svo_sim`` all verbs are scored from the subject row of
    factor matrix 0, the object row of factor matrix 2, the verb factor
    matrix 1 and ``ktensor.lmbda``; the ``prec_at`` best-scoring verbs are
    kept.  The function adds these columns to ``svo_sim`` in place:

    * ``predicted_<weight>_<rank>``: list of predicted verbs (empty list when
      a word is missing from the index),
    * ``good_landmark_<weight>_<rank>`` and ``good_verb_<weight>_<rank>``:
      whether the row's ``landmark`` / ``verb`` is among the predictions.

    The number of hits per target is logged at DEBUG level.

    Parameters
    ----------
    weight : str
        Name of the weighting; used in the pickle file names and column names.
    rank : int
        Rank of the decomposition; used in the pickle file name and column names.
    cutoff : int
        Cutoff value; only used to build the pickle file names.
    prec_at : int
        Number of top-scoring verbs kept per row.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code; only use this with pickle
    files from a trusted source.
    """
    # `with` makes sure the file handles are closed again (they used to leak).
    with open(
            '/mnt/store/home/makrai/project/verb-tensor/depCC/sparstensr_{}_{}.pkl'.format(weight, cutoff),
            mode='rb') as infile:
        _, index = pickle.load(infile)
    with open(
            '/mnt/store/home/makrai/project/verb-tensor/depCC/ktensor_{}_{}_{}.pkl'.format(weight, cutoff, rank),
            mode='rb') as infile:
        ktensor, _fit, _n_iterations, _exectimes = pickle.load(infile)
    #modes = ['nsubj', 'ROOT', 'dobj']
    #ktensor.U[1] /= np.apply_along_axis(np.linalg.norm, 1, ktensor.U[1]).reshape((-1,1))
    oov = defaultdict(int)
    predicted_col = 'predicted_{}_{}'.format(weight, rank)

    def verb_pred(ser):
        """Return the `prec_at` best-scoring verbs for one (subject, object) row."""
        try:
            subject_row = ktensor.U[0][index['nsubj'][ser['subject']]]
            # Fix: the object row has to be looked up with the object; the
            # subject was used for both look-ups before.
            object_row = ktensor.U[2][index['dobj'][ser['object']]]
            # Scores are negated so that argsort puts the best verbs first.
            predicted_ids = np.argsort(
                (-ktensor.lmbda * subject_row).dot((ktensor.U[1] * object_row).T))
            return [index['ROOT'].inverse[i] for i in predicted_ids[:prec_at]]
        except KeyError as e:
            # Word missing from the index: count it, predict nothing.
            oov[e.args] += 1
            return []

    svo_sim[predicted_col] = svo_sim[['subject', 'object']].apply(verb_pred, axis=1)
    #logging.debug(sorted(oov.items(), key=operator.itemgetter(1), reverse=True))
    for target in ['landmark', 'verb']:
        good_col = 'good_{}_{}_{}'.format(target, weight, rank)

        def is_good(ser):
            """True if the row's target word is among its predicted verbs."""
            return ser[target] in ser[predicted_col]

        svo_sim[good_col] = svo_sim.apply(is_good, axis=1)
        logging.debug((target, weight, rank, svo_sim[good_col].sum()))

In [None]:
def for_weight(max_exp_plus_one=7):
    """Call predict_verb for every weighting and every rank 2**1 .. 2**(max_exp_plus_one - 1).

    Weightings form the outer loop and ranks the inner loop.
    """
    weightings = ('log_freq', 'pmi', 'iact_info', 'salience', 'log_dice')  # iact_sali
    ranks = [2 ** exponent for exponent in range(1, max_exp_plus_one)]
    for weighting in weightings:
        for rank in ranks:
            predict_verb(weighting, rank)

In [None]:
%time for_weight()

In [None]:
cols_ordered = ["subject", "verb", "landmark", "object", "input", "hilo", "participant", 'predicted', 'good']

In [None]:
for target in ['landmark', 'verb']:
    print(target)
    print(svo_sim.groupby(target).size().describe(percentiles=[]))

In [None]:
svo_sim.sum(numeric_only=True).sort_values(ascending=False)/svo_sim.shape[0]

|verb|`* 1`|`* lmbda`|
|----|--|------|
|unnorm|130|**272**|
|norm|0|24|

Majority baseline...

In [None]:
# Size of each `target` group, sorted ascending, divided by the number of
# distinct target values.
# NOTE(review): the denominator is the number of distinct targets, not the
# number of rows -- confirm this is the intended normalisation for a
# majority baseline.
target = 'landmark'
svo_sim.groupby(target).size().sort_values()/len(svo_sim[target].unique())

# [Verb prediction (tensor)](https://everest.hds.utc.fr/doku.php?id=en:lfmnips12) (Jenatton+ NIPS 2012)

In [None]:
dataset_dir = '/mnt/permanent/Language/English/Data/verb-similarity/SVO-tensor-dataset/'
def get_index(pos, directory=None):
    """Read the id -> word mapping from the file ``svo-<pos>s.lst``.

    Each line of the file is stripped and split on ``'_'``; the first two and
    the last piece are dropped and the remaining pieces are joined with a
    space.  Ids are the 1-based line numbers.

    Parameters
    ----------
    pos : str
        Goes into the file name, e.g. ``'verb'`` reads ``svo-verbs.lst``.
    directory : str, optional
        Directory that contains the list file; defaults to the module-level
        ``dataset_dir``.

    Returns
    -------
    dict
        Maps the 1-based line number to the string built from that line.
    """
    if directory is None:
        directory = dataset_dir
    with open(os.path.join(directory, 'svo-{}s.lst'.format(pos))) as infile:
        return {line_no: ' '.join(line.strip().split('_')[2:-1])
                for line_no, line in enumerate(infile, start=1)}

In [None]:
svo_df = pd.read_csv(os.path.join(dataset_dir, 'svo_data_train_1000000.dat'), sep='\t', header=None, 
                     names=['subject', 'verb', 'object'])

In [None]:
index_verb = get_index('verb')
index_noun = get_index('noun')

In [None]:
# Replace the ids in each column by the strings from the matching index
# (nouns for subject and object, verbs for verb).
# NOTE: this overwrites the columns in place and dict.get returns None for
# unknown keys, so running the cell a second time (when the columns already
# hold strings instead of ids) would presumably turn the values into None --
# reload svo_df before re-running.
svo_df.subject = svo_df.subject.apply(index_noun.get)
svo_df.verb = svo_df.verb.apply(index_verb.get)
svo_df.object = svo_df.object.apply(index_noun.get)

In [None]:
svo_df.head(10)